diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index e3100fd603e..5e5b95ce826 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -6,7 +6,7 @@ on: # - 'raftstore-proxy*' pull_request: branches: - - 'raftstore-proxy*' + - 'ldz/*' jobs: build-check-old: diff --git a/Cargo.lock b/Cargo.lock index 44c286283c4..aa7eb276e28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,6 +84,7 @@ dependencies = [ "codec", "engine_traits", "kvproto", + "log_wrappers", "match-template", "panic_hook", "thiserror", @@ -137,6 +138,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "assert-type-eq" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd49a41856ee21a0cfb2b1cfbfcca0f1d3e6c257c38939f0d6ecfaf177f2ea47" + [[package]] name = "async-channel" version = "1.6.1" @@ -224,7 +231,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" dependencies = [ "error-code", - "libc 0.2.132", + "libc 0.2.139", "wasm-bindgen", "winapi 0.3.9", ] @@ -255,7 +262,7 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -445,7 +452,7 @@ dependencies = [ "addr2line", "cc", "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "miniz_oxide 0.4.4", "object", "rustc-demangle", @@ -579,9 +586,11 @@ dependencies = [ "derive_more", "fail", "file_system", + "kvproto", "lazy_static", "online_config", "prometheus", + "resource_control", "serde", "serde_derive", "slog", @@ -599,7 +608,7 @@ dependencies = [ "bcc-sys", "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "regex", "thiserror", ] @@ -735,7 +744,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -761,7 +770,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7f788eaf239475a3c1e1acf89951255a46c4b9b46cf3e866fc4d0707b4b9e36" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "valgrind_request", ] @@ -803,7 +812,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -934,7 +943,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1" dependencies = [ "glob", - "libc 0.2.132", + "libc 0.2.139", "libloading", ] @@ -988,7 +997,7 @@ dependencies = [ "byteorder", "bytes", "error_code", - "libc 0.2.132", + "libc 0.2.139", "panic_hook", "protobuf", "rand 0.8.5", @@ -1014,7 +1023,7 @@ dependencies = [ "fail", "futures 0.3.15", "kvproto", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "rand 0.8.5", "tikv_alloc", "tikv_util", @@ -1047,7 +1056,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1071,7 +1080,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" dependencies = [ - "libc 
0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1129,7 +1138,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63aaaf47e457badbcb376c65a49d0f182c317ebd97dc6d1ced94c8e1d09c0f3a" dependencies = [ "criterion", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1186,18 +1195,6 @@ dependencies = [ "crossbeam-utils 0.8.11", ] -[[package]] -name = "crossbeam-epoch" -version = "0.9.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.3", - "lazy_static", - "memoffset", - "scopeguard", -] - [[package]] name = "crossbeam-epoch" version = "0.9.8" @@ -1208,7 +1205,7 @@ dependencies = [ "cfg-if 1.0.0", "crossbeam-utils 0.8.8", "lazy_static", - "memoffset", + "memoffset 0.6.4", "scopeguard", ] @@ -1220,7 +1217,7 @@ dependencies = [ "autocfg", "cfg-if 1.0.0", "crossbeam-utils 0.8.11", - "memoffset", + "memoffset 0.6.4", "once_cell", "scopeguard", ] @@ -1237,12 +1234,13 @@ dependencies = [ [[package]] name = "crossbeam-skiplist" -version = "0.0.0" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.3", - "crossbeam-utils 0.8.3", + "crossbeam-epoch 0.9.8", + "crossbeam-utils 0.8.8", "scopeguard", ] @@ -1257,16 +1255,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1361,7 +1349,7 @@ checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", ] [[package]] @@ -1420,7 +1408,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_users", "winapi 0.3.9", ] @@ -1623,8 +1611,9 @@ dependencies = [ "online_config", "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -1684,10 +1673,11 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log_wrappers", "num_cpus", "online_config", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -1801,7 +1791,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "str-buf", ] @@ -1909,7 +1899,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "matches", "nix 0.24.1", @@ -1965,11 +1955,11 @@ dependencies = [ "crossbeam-utils 0.8.8", "fs2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "maligned", "online_config", "openssl", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "rand 0.8.5", @@ -1990,7 +1980,7 @@ version = "0.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "thiserror", "winapi 0.3.9", ] @@ -2002,7 +1992,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "winapi 0.3.9", ] @@ -2015,7 +2005,7 @@ checksum = "d691fdb3f817632d259d09220d4cf0991dbb2c9e59e044a02a59194bf6e14484" dependencies = [ "cc", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -2043,7 +2033,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2adaffba6388640136149e18ed080b77a78611c1e1d6de75aedcdf78df5d4682" dependencies = [ "crc32fast", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "miniz_oxide 0.3.7", ] @@ -2084,7 +2074,7 @@ name = "fs2" version = "0.4.3" source = "git+https://github.com/tabokie/fs2-rs?branch=tikv#cd503764a19a99d74c1ab424dd13d6bcd093fcae" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -2110,7 +2100,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2355,7 +2345,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.7.0", ] @@ -2367,7 +2357,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2416,7 +2406,7 @@ dependencies = [ "futures-executor", "futures-util", "grpcio-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "parking_lot 0.11.1", "protobuf", @@ -2453,7 +2443,7 @@ dependencies = [ "bindgen 0.59.2", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "openssl-sys", "pkg-config", @@ -2521,7 +2511,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2729,7 +2719,7 @@ checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" dependencies = [ "bitflags", "inotify-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2738,7 +2728,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2750,6 +2740,27 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "int-enum" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff87d3cc4b79b4559e3c75068d64247284aceb6a038bd4bb38387f3f164476d" +dependencies = [ + "int-enum-impl", +] + +[[package]] +name = "int-enum-impl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1f2f068675add1a3fc77f5f5ab2e29290c841ee34d151abc007bce902e5d34" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "into_other" version = "0.0.1" @@ 
-2765,7 +2776,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2820,7 +2831,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b1d42ef453b30b7387e113da1c83ab1605d90c5b4e0eb8e96d016ed3b8c160" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "log", ] @@ -2862,17 +2873,18 @@ dependencies = [ "panic_hook", "thiserror", "tikv_alloc", + "tikv_util", ] [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e53d558bc6d7d8b7bb2d283cdf6dda52a2615632" +source = "git+https://github.com/pingcap/kvproto.git#2b853bed812556901846f42820b63d8a0d9c8d24" dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", "raft-proto", ] @@ -2960,9 +2972,9 @@ checksum = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122" [[package]] name = "libc" -version = "0.2.132" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libfuzzer-sys" @@ -3002,7 +3014,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libtitan_sys", "libz-sys", "lz4-sys", @@ -3020,7 +3032,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "lz4-sys", "snappy-sys", @@ -3034,7 +3046,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", "vcpkg", ] @@ -3090,7 +3102,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3145,7 +3157,7 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3154,7 +3166,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3164,7 +3176,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3176,6 +3188,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memory_trace_macros" version = "0.1.0" @@ -3235,7 +3256,7 @@ dependencies = [ "fuchsia-zircon-sys", "iovec", "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "miow", "net2", @@ -3249,7 +3270,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.42.0", @@ -3295,7 +3316,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3346,7 +3367,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "openssl", "openssl-probe", @@ -3364,7 +3385,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3373,6 +3394,7 @@ name = "new-mock-engine-store" version = "0.0.1" dependencies = [ "api_version", + "assert-type-eq", "causal_ts", "collections", "concurrency_manager", @@ -3389,6 +3411,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "grpcio-health", + "int-enum", "keys", "kvproto", "lazy_static", @@ -3400,6 +3423,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "slog", @@ -3424,7 +3448,7 @@ dependencies = [ "bitflags", "cc", "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "void", ] @@ -3437,8 +3461,8 @@ dependencies = [ "bitflags", "cc", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.6.4", ] [[package]] @@ -3449,22 +3473,22 @@ checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" dependencies = [ "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.6.4", ] [[package]] name = "nix" -version = "0.25.0" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ - "autocfg", "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.7.1", "pin-utils", + "static_assertions", ] [[package]] @@ -3521,7 +3545,7 @@ dependencies = [ "fsevent", "fsevent-sys", "inotify", - "libc 0.2.132", + "libc 0.2.139", "mio 0.6.23", "mio-extras", "walkdir", @@ -3674,7 +3698,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3752,7 +3776,7 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "openssl-macros", "openssl-sys", @@ -3792,7 +3816,7 @@ checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", - "libc 0.2.132", + "libc 0.2.139", "openssl-src", "pkg-config", "vcpkg", @@ -3813,7 +3837,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3834,9 +3858,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core 0.9.1", @@ -3850,7 +3874,7 @@ checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" dependencies = [ "cfg-if 1.0.0", "instant", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "winapi 0.3.9", @@ -3863,7 +3887,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "windows-sys 0.32.0", @@ -3898,6 +3922,7 @@ dependencies = [ "log", "log_wrappers", "prometheus", + "prometheus-static-metric", "security", "semver 0.10.0", "serde", @@ -3939,7 +3964,7 @@ checksum = "b8f94885300e262ef461aa9fd1afbf7df3caf9e84e271a74925d1c6c8b24830f" dependencies = [ "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "mmap", "nom 4.2.3", "phf", @@ -4082,7 +4107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27361d7578b410d0eb5fe815c2b2105b01ab770a7c738cb9a231457a809fcc7" dependencies = [ "ipnetwork", - "libc 0.2.132", + "libc 0.2.139", "pnet_base", "pnet_sys", "winapi 0.2.8", @@ -4094,11 +4119,17 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82f881a6d75ac98c5541db6144682d1773bb14c6fc50c6ebac7086c8f7f23c29" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", "ws2_32-sys", ] +[[package]] +name = "portable-atomic" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26f6a7b87c2e435a3241addceeeff740ff8b7e76b74c13bf9acb17fa454ea00b" + [[package]] name = "pprof" version = "0.11.0" @@ -4109,11 +4140,11 @@ dependencies = [ "cfg-if 1.0.0", "findshlibs", "inferno", - "libc 0.2.132", + "libc 0.2.139", "log", "nix 0.24.1", "once_cell", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "protobuf", "protobuf-codegen-pure", "smallvec", @@ -4138,6 +4169,17 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9" +dependencies = [ + "once_cell", + "thiserror", + "toml", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -4193,7 +4235,7 @@ dependencies = [ "byteorder", "hex 0.4.2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -4202,7 +4244,7 @@ version = "0.4.2" source = "git+https://github.com/tikv/procinfo-rs?rev=6599eb9dca74229b2c1fcc44118bef7eff127128#6599eb9dca74229b2c1fcc44118bef7eff127128" dependencies = [ "byteorder", - "libc 0.2.132", + "libc 0.2.139", "nom 2.2.1", "rustc_version 0.2.3", ] @@ -4226,7 +4268,7 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "memchr", "parking_lot 0.11.1", "protobuf", @@ -4347,6 +4389,18 @@ dependencies = [ "regex", ] +[[package]] +name = "protobuf-build" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb3c02f54ecaf12572c1a60dbdb36b1f8f713a16105881143f2be84cca5bbe3" +dependencies = [ + "bitflags", + "protobuf", + "protobuf-codegen", + "regex", +] + [[package]] name = "protobuf-codegen" version = "2.8.0" @@ -4385,6 +4439,7 @@ 
dependencies = [ "engine_rocks", "engine_rocks_helper", "engine_store_ffi", + "engine_tiflash", "engine_traits", "error_code", "fail", @@ -4399,7 +4454,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "mime", @@ -4417,6 +4472,7 @@ dependencies = [ "rand 0.8.5", "regex", "resolved_ts", + "resource_control", "resource_metering", "security", "serde", @@ -4476,7 +4532,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log_wrappers", "more-asserts", "new-mock-engine-store", @@ -4546,7 +4602,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "fxhash", @@ -4561,7 +4617,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" +source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" dependencies = [ "byteorder", "crc32fast", @@ -4572,14 +4628,14 @@ dependencies = [ "hex 0.4.2", "if_chain", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "lz4-sys", "memmap2", - "nix 0.25.0", + "nix 0.26.2", "num-derive", "num-traits", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "protobuf", @@ -4594,11 +4650,11 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "protobuf", - "protobuf-build", + "protobuf-build 0.14.0", ] [[package]] @@ -4666,7 +4722,7 @@ dependencies = [ "openssl", "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -4674,6 +4730,7 @@ dependencies = [ "raft", "raft-proto", "rand 0.8.5", + "resource_control", "resource_metering", "serde", "serde_derive", @@ -4723,7 +4780,7 @@ dependencies = [ "keys", "kvproto", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "protobuf", @@ -4731,6 +4788,7 @@ dependencies = [ "raft-proto", "raftstore", "rand 0.8.5", + "resource_control", "resource_metering", "slog", "slog-global", @@ -4753,7 +4811,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", - "libc 0.2.132", + "libc 0.2.139", "rand_core 0.3.1", "rdrand", "winapi 0.3.9", @@ -4766,7 +4824,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.2.1", "rand_core 0.5.1", "rand_hc", @@ -4778,7 +4836,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.3.0", "rand_core 0.6.2", ] @@ -5038,6 +5096,31 @@ dependencies = [ "txn_types", ] +[[package]] +name = "resource_control" +version = "0.0.1" +dependencies = [ + 
"byteorder", + "crossbeam-skiplist", + "dashmap", + "fail", + "futures 0.3.15", + "kvproto", + "lazy_static", + "online_config", + "pd_client", + "pin-project", + "prometheus", + "protobuf", + "serde", + "slog", + "slog-global", + "test_pd", + "test_pd_client", + "tikv_util", + "yatp", +] + [[package]] name = "resource_metering" version = "0.0.1" @@ -5048,7 +5131,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "online_config", "pdqselect", @@ -5085,7 +5168,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "spin", "untrusted", @@ -5098,7 +5181,7 @@ name = "rocksdb" version = "0.3.0" source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "librocksdb_sys", ] @@ -5346,7 +5429,7 @@ dependencies = [ "bitflags", "core-foundation", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "security-framework-sys", ] @@ -5357,7 +5440,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5553,7 +5636,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "pd_client", @@ -5565,6 +5648,7 @@ dependencies = [ "raftstore-v2", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "serde_json", @@ -5613,7 +5697,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106428d9d96840ecdec5208c13ab8a4e28c38da1e0ccf2909fb44e41b992f897" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "nix 0.11.1", ] @@ -5623,7 +5707,7 @@ version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "signal-hook-registry", ] @@ -5633,7 +5717,7 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5755,7 +5839,7 @@ version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -5783,7 +5867,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -5988,7 +6072,7 @@ checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "ntapi", "once_cell", "rayon", @@ -6071,7 +6155,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", @@ -6155,11 +6239,14 @@ dependencies = [ "futures 0.3.15", "grpcio", "kvproto", + 
"log_wrappers", "pd_client", "security", "slog", "slog-global", "tikv_util", + "tokio", + "tokio-stream", ] [[package]] @@ -6213,6 +6300,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "server", @@ -6310,7 +6398,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log_wrappers", "more-asserts", "online_config", @@ -6326,6 +6414,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "rand_xorshift", + "resource_control", "resource_metering", "security", "serde_json", @@ -6427,6 +6516,7 @@ name = "tidb_query_common" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "byteorder", "derive_more", @@ -6448,6 +6538,7 @@ dependencies = [ name = "tidb_query_datatype" version = "0.0.1" dependencies = [ + "api_version", "base64", "bitfield", "bitflags", @@ -6488,6 +6579,7 @@ name = "tidb_query_executors" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "codec", "collections", @@ -6601,7 +6693,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "log", "log_wrappers", @@ -6617,7 +6709,7 @@ dependencies = [ "online_config", "openssl", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "paste", "pd_client", "pin-project", @@ -6635,6 +6727,7 @@ dependencies = [ "rand 0.7.3", "regex", "reqwest", + "resource_control", "resource_metering", "rev_lines", "seahash", @@ -6684,7 +6777,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "paste", "tikv-jemalloc-sys", ] @@ -6697,7 +6790,7 @@ checksum = "aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -6706,7 +6799,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "tikv-jemalloc-sys", ] @@ -6716,7 +6809,7 @@ version = "0.1.0" dependencies = [ "fxhash", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "mimalloc", "snmalloc-rs", "tcmalloc", @@ -6774,6 +6867,7 @@ dependencies = [ "cpu-time", "crc32fast", "crossbeam", + "crossbeam-skiplist", "derive_more", "error_code", "fail", @@ -6784,7 +6878,7 @@ dependencies = [ "http", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "mnt", @@ -6795,6 +6889,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", + "parking_lot_core 0.9.1", "pin-project", "procfs", "procinfo", @@ -6832,7 +6927,7 @@ version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.1.56", "winapi 0.3.9", ] @@ -6850,12 +6945,12 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#f3286471a05a4454a1071dd5f66ac7dbf6c79ba3" +source = "git+https://github.com/pingcap/tipb.git#c6b7a5a1623bb2766a502301ecc3ac8f98cc7c79" dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", ] [[package]] @@ -6875,11 +6970,11 @@ checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies 
= [ "autocfg", "bytes", - "libc 0.2.132", + "libc 0.2.139", "memchr", "mio 0.8.5", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -7133,7 +7228,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "kvproto", "lazy_static", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project", "prometheus", "slab", @@ -7261,7 +7356,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" dependencies = [ "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", ] @@ -7450,7 +7545,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -7461,7 +7556,7 @@ checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" dependencies = [ "either", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -7656,9 +7751,11 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#39cb495953d40a7e846363c06090755c2eac65fa" +source = "git+https://github.com/tikv/yatp.git?branch=master#bcf431a2619c06ab7fa0c72073a0c775646c484f" dependencies = [ "crossbeam-deque", + "crossbeam-skiplist", + "crossbeam-utils 0.8.8", "dashmap", "fail", "lazy_static", @@ -7698,7 +7795,7 @@ version = "5.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "zstd-sys", ] @@ -7709,5 +7806,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] diff --git a/Cargo.toml b/Cargo.toml index 551630811df..6235d1d9c8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -140,6 +140,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" +resource_control = { workspace = true } resource_metering = { workspace = true } rev_lines = "0.2.1" seahash = "4.1.0" @@ -196,6 +197,7 @@ prometheus = { git = "https://github.com/solotzg/rust-prometheus.git", rev = "b4 # TODO: remove this when new raft-rs is published. 
raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } + protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } @@ -266,6 +268,7 @@ members = [ "components/raftstore", "components/raftstore-v2", "components/resolved_ts", + "components/resource_control", "components/resource_metering", "components/security", "components/server", @@ -344,6 +347,7 @@ raft_log_engine = { path = "components/raft_log_engine" } raftstore = { path = "components/raftstore", default-features = false } raftstore-v2 = { path = "components/raftstore-v2", default-features = false } resolved_ts = { path = "components/resolved_ts" } +resource_control = { path = "components/resource_control" } resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } diff --git a/Dockerfile b/Dockerfile index c4ad36dc6e7..aefa51b2222 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,11 @@ RUN ln -s /usr/bin/cmake3 /usr/bin/cmake ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH +# Install protoc +RUN curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip" +RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ +ENV PATH /usr/local/bin/:$PATH + # Install Rustup RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y ENV PATH /root/.cargo/bin/:$PATH @@ -72,8 +77,7 @@ RUN mkdir -p ./cmd/tikv-ctl/src ./cmd/tikv-server/src && \ echo 'fn main() {}' > ./cmd/tikv-ctl/src/main.rs && \ echo 'fn main() {}' > ./cmd/tikv-server/src/main.rs && \ for cargotoml in $(find . -type f -name "Cargo.toml"); do \ - sed -i '/fuzz/d' ${cargotoml} && \ - sed -i '/profiler/d' ${cargotoml} ; \ + sed -i '/fuzz/d' ${cargotoml} ; \ done COPY Makefile ./ @@ -105,8 +109,9 @@ FROM pingcap/alpine-glibc COPY --from=builder /tikv/target/release/tikv-server /tikv-server COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl +# FIXME: Figure out why libstdc++ is not staticly linked. RUN apk add --no-cache \ - curl + curl libstdc++ EXPOSE 20160 20180 diff --git a/Makefile b/Makefile index b7ffa452cb4..2e55ed72176 100644 --- a/Makefile +++ b/Makefile @@ -213,19 +213,26 @@ pre-format: unset-override @rustup component add rustfmt @cargo install --force -q cargo-sort +pre-format-fast: unset-override + @rustup component add rustfmt + @cargo install -q cargo-sort + ci_fmt_check: M="fmt" ./proxy_scripts/ci_check.sh ci_test: wget https://github.com/protocolbuffers/protobuf/releases/download/v3.8.0/protoc-3.8.0-linux-x86_64.zip unzip protoc-3.8.0-linux-x86_64.zip - PROTOC="`pwd`/bin/protoc" M="testold" ./proxy_scripts/ci_check.sh + # PROTOC="`pwd`/bin/protoc" M="testold" ./proxy_scripts/ci_check.sh PROTOC="`pwd`/bin/protoc" M="testnew" ./proxy_scripts/ci_check.sh make debug gen_proxy_ffi: pre-format ./gen-proxy-ffi.sh +gen_proxy_ffi_fast: pre-format-fast + ./gen-proxy-ffi.sh + format: pre-format @cargo fmt @cargo sort -w ./Cargo.toml ./*/Cargo.toml components/*/Cargo.toml cmd/*/Cargo.toml >/dev/null diff --git a/cmd/build.rs b/cmd/build.rs index 6d11a38f705..c19797d9227 100644 --- a/cmd/build.rs +++ b/cmd/build.rs @@ -32,7 +32,9 @@ fn link_sys_lib(lib: &str, tool: &cc::Tool) { } // remove lib prefix and .a postfix. 
let libname = &lib[3..lib.len() - 2]; - println!("cargo:rustc-link-lib=static:+whole-archive={}", &libname); + // Get around the issue "the linking modifiers `+bundle` and `+whole-archive` + // are not compatible with each other when generating rlibs" + println!("cargo:rustc-link-lib=static:-bundle,+whole-archive={}", &libname); println!( "cargo:rustc-link-search=native={}", path.parent().unwrap().display() diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 30cd7035bef..e4c7be98dba 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -686,7 +686,8 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - cfg.rocksdb.build_opt(&cfg.rocksdb.build_resources(env)) + let resource = cfg.rocksdb.build_resources(env); + cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } fn run_ldb_command(args: Vec, cfg: &TikvConfig) { diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index 7362ca25ccc..c80607145bd 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -12,6 +12,7 @@ bitflags = "1.0.1" codec = { workspace = true } engine_traits = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/api_version/src/keyspace.rs b/components/api_version/src/keyspace.rs new file mode 100644 index 00000000000..4b263822a1b --- /dev/null +++ b/components/api_version/src/keyspace.rs @@ -0,0 +1,163 @@ +use std::fmt::Debug; + +use engine_traits::{Error, Result}; +use tikv_util::box_err; + +use super::*; + +const KEYSPACE_PREFIX_LEN: usize = 4; + +pub trait KvPair { + fn key(&self) -> &[u8]; + fn value(&self) -> &[u8]; + fn kv(&self) -> (&[u8], &[u8]) { + (self.key(), self.value()) + } +} + +impl KvPair for (Vec, Vec) { + fn key(&self) -> &[u8] { + &self.0 + } + fn value(&self) -> &[u8] { + &self.1 + } +} + +pub trait Keyspace { + type KvPair: KvPair = (Vec, Vec); + fn make_kv_pair(p: (Vec, Vec)) -> Result; + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + Ok((None, key)) + } +} + +#[derive(PartialEq, Clone, Copy, Debug)] +pub struct KeyspaceId(u32); + +impl From for KeyspaceId { + fn from(id: u32) -> Self { + Self(id) + } +} + +impl Keyspace for ApiV1 { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV1Ttl { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV2 { + type KvPair = KeyspaceKv; + + fn make_kv_pair(p: (Vec, Vec)) -> Result { + let (k, v) = p; + let (keyspace, _) = Self::parse_keyspace(&k)?; + Ok(KeyspaceKv { + k, + v, + keyspace: keyspace.unwrap(), + }) + } + + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + let mode = ApiV2::parse_key_mode(key); + if key.len() < KEYSPACE_PREFIX_LEN || (mode != KeyMode::Raw && mode != KeyMode::Txn) { + return Err(Error::Other(box_err!( + "invalid API V2 key: {}", + log_wrappers::Value(key) + ))); + } + let id = u32::from_be_bytes([0, key[1], key[2], key[3]]); + Ok((Some(KeyspaceId::from(id)), &key[KEYSPACE_PREFIX_LEN..])) + } +} + +pub struct KeyspaceKv { + k: Vec, + v: Vec, + keyspace: KeyspaceId, +} + +impl KvPair for KeyspaceKv { + fn key(&self) -> &[u8] { + &self.k[KEYSPACE_PREFIX_LEN..] 
+ } + + fn value(&self) -> &[u8] { + &self.v + } +} + +impl KeyspaceKv { + pub fn keyspace(&self) -> KeyspaceId { + self.keyspace + } +} + +impl PartialEq<(Vec, Vec)> for KeyspaceKv { + fn eq(&self, other: &(Vec, Vec)) -> bool { + self.kv() == (&other.0, &other.1) + } +} + +impl PartialEq for KeyspaceKv { + fn eq(&self, other: &Self) -> bool { + self.k == other.k && self.v == other.v + } +} + +impl Debug for KeyspaceKv { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceKv") + .field("key", &log_wrappers::Value(self.key())) + .field("value", &log_wrappers::Value(self.value())) + .field("keyspace", &self.keyspace()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_v1_parse_keyspace() { + let k = b"t123_111"; + let (keyspace, key) = ApiV1::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + + let (keyspace, key) = ApiV1Ttl::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + } + + #[test] + fn test_v2_parse_keyspace() { + let ok = vec![ + (b"x\x00\x00\x01t123_114", 1, b"t123_114"), + (b"r\x00\x00\x01t123_112", 1, b"t123_112"), + (b"x\x01\x00\x00t213_112", 0x010000, b"t213_112"), + (b"r\x01\x00\x00t123_113", 0x010000, b"t123_113"), + ]; + + for (key, id, user_key) in ok { + let (keyspace, key) = ApiV2::parse_keyspace(key).unwrap(); + assert_eq!(Some(KeyspaceId::from(id)), keyspace); + assert_eq!(user_key, key); + } + + let err: Vec<&[u8]> = vec![b"t123_111", b"s\x00\x00", b"r\x00\x00"]; + + for key in err { + ApiV2::parse_keyspace(key).unwrap_err(); + } + } +} diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index 0c9ae388917..879751e7b62 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -1,17 +1,21 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. #![feature(min_specialization)] +#![feature(associated_type_defaults)] mod api_v1; mod api_v1ttl; pub mod api_v2; +pub mod keyspace; use engine_traits::Result; use kvproto::kvrpcpb::ApiVersion; pub use match_template::match_template; use txn_types::{Key, TimeStamp}; -pub trait KvFormat: Clone + Copy + 'static + Send + Sync { +use crate::keyspace::Keyspace; + +pub trait KvFormat: Keyspace + Clone + Copy + 'static + Send + Sync { const TAG: ApiVersion; /// Corresponding TAG of client requests. For test only. #[cfg(any(test, feature = "testexport"))] diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index b712a23973d..37ffbad37c4 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -1,15 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; use openssl::x509::verify::X509VerifyFlags; +use security::SecurityManager; use tikv_util::{ info, stream::{RetryError, RetryExt}, + warn, }; -use tokio::sync::OnceCell; +use tokio::sync::Mutex as AsyncMutex; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; use crate::errors::{ContextualResultExt, Result}; @@ -17,20 +22,34 @@ use crate::errors::{ContextualResultExt, Result}; const RPC_TIMEOUT: Duration = Duration::from_secs(30); #[derive(Clone)] -pub struct LazyEtcdClient(Arc); +pub struct LazyEtcdClient(Arc>); -#[derive(Debug)] +#[derive(Clone)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Arc, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } +impl std::fmt::Debug for ConnectionConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectionConfig") + .field("keep_alive_interval", &self.keep_alive_interval) + .field("keep_alive_timeout", &self.keep_alive_timeout) + .finish() + } +} + impl ConnectionConfig { /// Convert the config to the connection option. fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); - if let Some(tls) = &self.tls { + if let Some(tls) = &self + .tls + .client_suite() + .map_err(|err| warn!("failed to load client suite!"; "err" => %err)) + .ok() + { opts = opts.with_openssl_tls( OpenSslClientConfig::default() .ca_cert_pem(&tls.ca) @@ -54,28 +73,27 @@ impl ConnectionConfig { impl LazyEtcdClient { pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - Self(Arc::new(LazyEtcdClientInner { - opt: conf.to_connection_options(), + Self(Arc::new(AsyncMutex::new(LazyEtcdClientInner { + conf, endpoints: endpoints.iter().map(ToString::to_string).collect(), - cli: OnceCell::new(), - })) + last_modified: None, + cli: None, + }))) } -} - -impl std::ops::Deref for LazyEtcdClient { - type Target = LazyEtcdClientInner; - fn deref(&self) -> &Self::Target { - Arc::deref(&self.0) + async fn get_cli(&self) -> Result { + let mut l = self.0.lock().await; + l.get_cli().await.cloned() } } #[derive(Clone)] pub struct LazyEtcdClientInner { - opt: ConnectOptions, + conf: ConnectionConfig, endpoints: Vec, - cli: OnceCell, + last_modified: Option, + cli: Option, } fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { @@ -130,23 +148,34 @@ where } impl LazyEtcdClientInner { - async fn connect(&self) -> Result { + async fn connect(&mut self) -> Result<&EtcdStore> { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control // how to create channels when connecting, hence we cannot update the tls config - // at runtime. - // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt - // the `SecurityManager` API, instead of doing everything by own. - etcd_client::Client::connect(self.endpoints.clone(), Some(self.opt.clone())) + // at runtime, now what we did is manually check that each time we are getting + // the clients. 
+ etcd_client::Client::connect( + self.endpoints.clone(), + Some(self.conf.to_connection_options()), + ) }) .await .context("during connecting to the etcd")?; - Ok(EtcdStore::from(store)) + let store = EtcdStore::from(store); + self.cli = Some(store); + Ok(self.cli.as_ref().unwrap()) } - pub async fn get_cli(&self) -> Result<&EtcdStore> { - let store = self.cli.get_or_try_init(|| self.connect()).await?; - Ok(store) + pub async fn get_cli(&mut self) -> Result<&EtcdStore> { + let modified = self.conf.tls.get_config().is_modified(&mut self.last_modified) + // Don't reload once we cannot check whether it is modified. + // Because when TLS disabled, this would always fail. + .unwrap_or(false); + if !modified && self.cli.is_some() { + return Ok(self.cli.as_ref().unwrap()); + } + info!("log backup reconnecting to the etcd service."; "tls_modified" => %modified, "connected_before" => %self.cli.is_some()); + self.connect().await } } @@ -155,7 +184,7 @@ impl MetaStore for LazyEtcdClient { type Snap = EtcdSnapshot; async fn snapshot(&self) -> Result { - self.0.get_cli().await?.snapshot().await + self.get_cli().await?.snapshot().await } async fn watch( @@ -163,14 +192,14 @@ impl MetaStore for LazyEtcdClient { keys: super::Keys, start_rev: i64, ) -> Result { - self.0.get_cli().await?.watch(keys, start_rev).await + self.get_cli().await?.watch(keys, start_rev).await } async fn txn(&self, txn: super::Transaction) -> Result<()> { - self.0.get_cli().await?.txn(txn).await + self.get_cli().await?.txn(txn).await } async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { - self.0.get_cli().await?.txn_cond(txn).await + self.get_cli().await?.txn_cond(txn).await } } diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 7fe5798f833..75a0230c188 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -13,9 +13,11 @@ crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" online_config = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/batch-system/benches/batch-system.rs b/components/batch-system/benches/batch-system.rs index c248eabaf04..9edf72f0ff9 100644 --- a/components/batch-system/benches/batch-system.rs +++ b/components/batch-system/benches/batch-system.rs @@ -20,7 +20,7 @@ fn end_hook(tx: &std::sync::mpsc::Sender<()>) -> Message { fn bench_spawn_many(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 32; const MESSAGE_LIMIT: usize = 256; @@ -55,7 +55,7 @@ fn bench_spawn_many(c: &mut Criterion) { fn bench_imbalance(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 
= 10; const MESSAGE_LIMIT: usize = 512; @@ -92,7 +92,7 @@ fn bench_imbalance(c: &mut Criterion) { fn bench_fairness(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let state_cnt = Arc::new(AtomicUsize::new(0)); for id in 0..10 { diff --git a/components/batch-system/benches/router.rs b/components/batch-system/benches/router.rs index 3dd7e282e15..e25ee58b94d 100644 --- a/components/batch-system/benches/router.rs +++ b/components/batch-system/benches/router.rs @@ -8,7 +8,7 @@ use criterion::*; fn bench_send(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let (normal_tx, normal_fsm) = Runner::new(100000); let normal_box = BasicMailbox::new(normal_tx, normal_fsm, Arc::default()); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 4d935ad4819..48ef809d421 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -15,15 +15,16 @@ use std::{ time::Duration, }; -use crossbeam::channel::{self, SendError}; use fail::fail_point; use file_system::{set_io_type, IoType}; +use resource_control::ResourceController; use tikv_util::{ debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, - time::Instant, warn, + time::Instant, }; use crate::{ + channel::{fsm_channel, ControlScheduler, FsmReceiver, FsmSender, NormalScheduler}, config::Config, fsm::{Fsm, FsmScheduler, Priority}, mailbox::BasicMailbox, @@ -37,60 +38,6 @@ pub enum FsmTypes { // Used as a signal that scheduler should be shutdown. Empty, } - -// A macro to introduce common definition of scheduler. -macro_rules! impl_sched { - ($name:ident, $ty:path,Fsm = $fsm:tt) => { - pub struct $name { - sender: channel::Sender>, - low_sender: channel::Sender>, - } - - impl Clone for $name { - #[inline] - fn clone(&self) -> $name { - $name { - sender: self.sender.clone(), - low_sender: self.low_sender.clone(), - } - } - } - - impl FsmScheduler for $name - where - $fsm: Fsm, - { - type Fsm = $fsm; - - #[inline] - fn schedule(&self, fsm: Box) { - let sender = match fsm.get_priority() { - Priority::Normal => &self.sender, - Priority::Low => &self.low_sender, - }; - match sender.send($ty(fsm)) { - Ok(()) => {} - // TODO: use debug instead. - Err(SendError($ty(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. - for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - let _ = self.low_sender.send(FsmTypes::Empty); - } - } - } - }; -} - -impl_sched!(NormalScheduler, FsmTypes::Normal, Fsm = N); -impl_sched!(ControlScheduler, FsmTypes::Control, Fsm = C); - pub struct NormalFsm { fsm: Box, timer: Instant, @@ -168,7 +115,7 @@ impl Batch { /// /// When pending messages of the FSM is different than `expected_len`, /// attempts to schedule it in this poller again. Returns the `fsm` if the - /// re-scheduling suceeds. + /// re-scheduling succeeds. 
fn release(&mut self, mut fsm: NormalFsm, expected_len: usize) -> Option> { let mailbox = fsm.take_mailbox().unwrap(); mailbox.release(fsm.fsm); @@ -341,7 +288,7 @@ pub trait PollHandler: Send + 'static { /// Internal poller that fetches batch and call handler hooks for readiness. pub struct Poller { pub router: Router, ControlScheduler>, - pub fsm_receiver: channel::Receiver>, + pub fsm_receiver: FsmReceiver, pub handler: Handler, pub max_batch_size: usize, pub reschedule_duration: Duration, @@ -534,8 +481,8 @@ pub trait HandlerBuilder { pub struct BatchSystem { name_prefix: Option, router: BatchRouter, - receiver: channel::Receiver>, - low_receiver: channel::Receiver>, + receiver: FsmReceiver, + low_receiver: FsmReceiver, pool_size: usize, max_batch_size: usize, workers: Arc>>>, @@ -649,15 +596,15 @@ where } } -struct PoolStateBuilder { +struct PoolStateBuilder { max_batch_size: usize, reschedule_duration: Duration, - fsm_receiver: channel::Receiver>, - fsm_sender: channel::Sender>, + fsm_receiver: FsmReceiver, + fsm_sender: FsmSender, pool_size: usize, } -impl PoolStateBuilder { +impl PoolStateBuilder { fn build>( self, name_prefix: String, @@ -683,11 +630,11 @@ impl PoolStateBuilder { } } -pub struct PoolState> { +pub struct PoolState> { pub name_prefix: String, pub handler_builder: H, - pub fsm_receiver: channel::Receiver>, - pub fsm_sender: channel::Sender>, + pub fsm_receiver: FsmReceiver, + pub fsm_sender: FsmSender, pub low_priority_pool_size: usize, pub expected_pool_size: usize, pub workers: Arc>>>, @@ -707,32 +654,32 @@ pub fn create_system( cfg: &Config, sender: mpsc::LooseBoundedSender, controller: Box, + resource_ctl: Option>, ) -> (BatchRouter, BatchSystem) { let state_cnt = Arc::new(AtomicUsize::new(0)); let control_box = BasicMailbox::new(sender, controller, state_cnt.clone()); - let (tx, rx) = channel::unbounded(); - let (tx2, rx2) = channel::unbounded(); + let (sender, receiver) = fsm_channel(resource_ctl); + let (low_sender, low_receiver) = fsm_channel(None); // no resource control for low fsm let normal_scheduler = NormalScheduler { - sender: tx.clone(), - low_sender: tx2.clone(), + sender: sender.clone(), + low_sender, }; let control_scheduler = ControlScheduler { - sender: tx.clone(), - low_sender: tx2, + sender: sender.clone(), }; let pool_state_builder = PoolStateBuilder { max_batch_size: cfg.max_batch_size(), reschedule_duration: cfg.reschedule_duration.0, - fsm_receiver: rx.clone(), - fsm_sender: tx, + fsm_receiver: receiver.clone(), + fsm_sender: sender, pool_size: cfg.pool_size, }; let router = Router::new(control_box, normal_scheduler, control_scheduler, state_cnt); let system = BatchSystem { name_prefix: None, router: router.clone(), - receiver: rx, - low_receiver: rx2, + receiver, + low_receiver, pool_size: cfg.pool_size, max_batch_size: cfg.max_batch_size(), workers: Arc::new(Mutex::new(Vec::new())), diff --git a/components/batch-system/src/channel.rs b/components/batch-system/src/channel.rs new file mode 100644 index 00000000000..094b6a7a2ae --- /dev/null +++ b/components/batch-system/src/channel.rs @@ -0,0 +1,252 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cell::RefCell, sync::Arc}; + +use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; +use kvproto::kvrpcpb::CommandPri; +use resource_control::{ResourceConsumeType, ResourceController}; +use tikv_util::{mpsc::priority_queue, warn}; + +use crate::{ + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, + FsmTypes, +}; + +pub fn fsm_channel( + resource_ctl: Option>, +) -> (FsmSender, FsmReceiver) { + if let Some(ctl) = resource_ctl { + let (tx, rx) = priority_queue::unbounded(); + ( + FsmSender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + FsmReceiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::unbounded(); + (FsmSender::Vanilla(tx), FsmReceiver::Vanilla(rx)) + } +} + +pub struct NormalScheduler { + pub(crate) sender: FsmSender, + pub(crate) low_sender: FsmSender, +} + +impl Clone for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + NormalScheduler { + sender: self.sender.clone(), + low_sender: self.low_sender.clone(), + } + } +} + +impl FsmScheduler for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = N; + + fn consume_msg_resource(&self, msg: &::Message) { + self.sender.consume_msg_resource(msg); + } + + #[inline] + fn schedule(&self, fsm: Box) { + let sender = match fsm.get_priority() { + Priority::Normal => &self.sender, + Priority::Low => &self.low_sender, + }; + + match sender.send(FsmTypes::Normal(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + let _ = self.low_sender.send(FsmTypes::Empty); + } + } +} + +pub struct ControlScheduler { + pub(crate) sender: FsmSender, +} + +impl Clone for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + ControlScheduler { + sender: self.sender.clone(), + } + } +} + +impl FsmScheduler for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = C; + + fn consume_msg_resource(&self, _msg: &::Message) {} + + #[inline] + fn schedule(&self, fsm: Box) { + match self.sender.send(FsmTypes::Control(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + } + } +} + +pub enum FsmSender { + Vanilla(channel::Sender>), + Priority { + resource_ctl: Arc, + sender: priority_queue::Sender>, + last_msg_group: RefCell, + }, +} + +impl Clone for FsmSender +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmSender::Vanilla(sender) => FsmSender::Vanilla(sender.clone()), + FsmSender::Priority { + resource_ctl, + sender, + .. 
+ } => FsmSender::Priority { + resource_ctl: resource_ctl.clone(), + sender: sender.clone(), + last_msg_group: RefCell::new(String::new()), + }, + } + } +} + +impl FsmSender { + pub fn send(&self, fsm: FsmTypes) -> Result<(), SendError>> { + match self { + FsmSender::Vanilla(sender) => sender.send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + // TODO: pass different priority + let pri = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.send(fsm, pri) + } + } + } + + pub fn try_send(&self, fsm: FsmTypes) -> Result<(), TrySendError>> { + match self { + FsmSender::Vanilla(sender) => sender.try_send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let priority = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.try_send(fsm, priority) + } + } + } + + fn consume_msg_resource(&self, msg: &N::Message) { + match self { + FsmSender::Vanilla(_) => {} + FsmSender::Priority { + resource_ctl, + last_msg_group, + .. + } => { + if let Some(mut groups) = msg.get_resource_consumptions() { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for (group_name, write_bytes) in groups.drain() { + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + } + *last_msg_group.borrow_mut() = dominant_group; + } + } + } + } +} + +pub enum FsmReceiver { + Vanilla(channel::Receiver>), + Priority(priority_queue::Receiver>), +} + +impl Clone for FsmReceiver +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmReceiver::Vanilla(receiver) => FsmReceiver::Vanilla(receiver.clone()), + FsmReceiver::Priority(receiver) => FsmReceiver::Priority(receiver.clone()), + } + } +} + +impl FsmReceiver { + pub fn recv(&self) -> Result, RecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.recv(), + FsmReceiver::Priority(receiver) => receiver.recv(), + } + } + + pub fn try_recv(&self) -> Result, TryRecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.try_recv(), + FsmReceiver::Priority(receiver) => receiver.try_recv(), + } + } +} diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 09e32333c96..5d9e009fa01 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -10,6 +10,8 @@ use std::{ usize, }; +use collections::HashMap; + use crate::mailbox::BasicMailbox; #[derive(Clone, Copy, Debug, PartialEq)] @@ -24,15 +26,26 @@ pub trait FsmScheduler { /// Schedule a Fsm for later handling. fn schedule(&self, fsm: Box); + /// Shutdown the scheduler, which indicates that resources like /// background thread pool should be released. fn shutdown(&self); + + /// Consume the resources of msg in resource controller if enabled, + /// otherwise do nothing. + fn consume_msg_resource(&self, msg: &::Message); +} + +pub trait ResourceMetered { + fn get_resource_consumptions(&self) -> Option> { + None + } } /// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. -pub trait Fsm { - type Message: Send; +pub trait Fsm: Send + 'static { + type Message: Send + ResourceMetered; fn is_stopped(&self) -> bool; @@ -42,6 +55,7 @@ pub trait Fsm { Self: Sized, { } + /// Take the mailbox from FSM. 
Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index 9a307a534ac..f4f799dcc9a 100644 --- a/components/batch-system/src/lib.rs +++ b/components/batch-system/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod batch; +mod channel; mod config; mod fsm; mod mailbox; @@ -16,7 +17,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, FsmScheduler, Priority}, + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/batch-system/src/mailbox.rs b/components/batch-system/src/mailbox.rs index 5afddf73c14..869031392af 100644 --- a/components/batch-system/src/mailbox.rs +++ b/components/batch-system/src/mailbox.rs @@ -75,6 +75,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), SendError> { + scheduler.consume_msg_resource(&msg); self.sender.force_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) @@ -89,6 +90,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), TrySendError> { + scheduler.consume_msg_resource(&msg); self.sender.try_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index bfcb93c9d6b..ef937209531 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,7 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{ - debug, info, - lru::LruCache, - time::{duration_to_sec, Instant}, - Either, -}; +use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, @@ -322,7 +317,7 @@ where for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } - BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed())); + BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } /// Try to notify all FSMs that the cluster is being shutdown. diff --git a/components/batch-system/src/test_runner.rs b/components/batch-system/src/test_runner.rs index 6be64d5d695..a3ae80dc55a 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -11,10 +11,11 @@ use std::{ }, }; +use collections::HashMap; use derive_more::{Add, AddAssign}; use tikv_util::mpsc; -use crate::*; +use crate::{fsm::ResourceMetered, *}; /// Message `Runner` can accepts. pub enum Message { @@ -22,6 +23,21 @@ pub enum Message { Loop(usize), /// `Runner` will call the callback directly. Callback(Box), + /// group name, write bytes + Resource(String, u64), +} + +impl ResourceMetered for Message { + fn get_resource_consumptions(&self) -> Option> { + match self { + Message::Resource(group_name, bytes) => { + let mut map = HashMap::default(); + map.insert(group_name.to_owned(), *bytes); + Some(map) + } + _ => None, + } + } } /// A simple runner used for benchmarking only. 
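Note: `ResourceMetered::get_resource_consumptions` is the hook the priority sender relies on. Every group in the returned map is charged its write bytes, and the group with the largest value becomes `last_msg_group`, which decides the priority used for the next send. A standalone sketch of that dominant-group selection, using only std types and a hypothetical helper name:

```rust
use std::collections::HashMap;

// Mirrors the loop in FsmSender::consume_msg_resource: after charging every
// group, the group that wrote the most bytes is remembered for the next
// priority lookup.
fn dominant_group(consumptions: HashMap<String, u64>) -> Option<String> {
    consumptions
        .into_iter()
        .max_by_key(|(_, write_bytes)| *write_bytes)
        .map(|(group, _)| group)
}

fn main() {
    let consumptions = HashMap::from([
        ("group1".to_string(), 1u64),
        ("group2".to_string(), 64u64),
    ]);
    assert_eq!(dominant_group(consumptions), Some("group2".to_string()));
}
```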
@@ -102,6 +118,7 @@ impl Handler { } } Ok(Message::Callback(cb)) => cb(self, r), + Ok(Message::Resource(..)) => {} Err(_) => break, } } diff --git a/components/batch-system/tests/cases/batch.rs b/components/batch-system/tests/cases/batch.rs index f950df68b8d..dc13affc363 100644 --- a/components/batch-system/tests/cases/batch.rs +++ b/components/batch-system/tests/cases/batch.rs @@ -7,13 +7,15 @@ use std::{ }; use batch_system::{test_runner::*, *}; +use kvproto::resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}; +use resource_control::ResourceGroupManager; use tikv_util::mpsc; #[test] fn test_batch() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); let metrics = builder.metrics.clone(); system.spawn("test".to_owned(), builder); @@ -55,7 +57,7 @@ fn test_batch() { fn test_priority() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); let (tx, rx) = mpsc::unbounded(); @@ -101,3 +103,102 @@ fn test_priority() { .unwrap(); assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(3)); } + +#[test] +fn test_resource_group() { + let (control_tx, control_fsm) = Runner::new(10); + let resource_manager = ResourceGroupManager::default(); + + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + group + }; + + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let mut cfg = Config::default(); + cfg.pool_size = 1; + let (router, mut system) = batch_system::create_system( + &cfg, + control_tx, + control_fsm, + Some(resource_manager.derive_controller("test".to_string(), false)), + ); + let builder = Builder::new(); + system.spawn("test".to_owned(), builder); + let (tx, rx) = mpsc::unbounded(); + let tx_ = tx.clone(); + let r = router.clone(); + let state_cnt = Arc::new(AtomicUsize::new(0)); + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + let (tx, runner) = Runner::new(10); + r.register(1, BasicMailbox::new(tx, runner, state_cnt.clone())); + let (tx2, runner2) = Runner::new(10); + r.register(2, BasicMailbox::new(tx2, runner2, state_cnt)); + tx_.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + let tx_ = tx.clone(); + let (tx1, rx1) = std::sync::mpsc::sync_channel(0); + // block the thread + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + tx_.send(0).unwrap(); + tx1.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + router + .send(1, Message::Resource("group1".to_string(), 1)) + .unwrap(); + let tx_ = tx.clone(); + 
router + .send( + 1, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx_.send(1).unwrap(); + })), + ) + .unwrap(); + + router + .send(2, Message::Resource("group2".to_string(), 1)) + .unwrap(); + router + .send( + 2, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx.send(2).unwrap(); + })), + ) + .unwrap(); + + // pause the blocking thread + assert_eq!(rx1.recv_timeout(Duration::from_secs(3)), Ok(0)); + + // should recv from group2 first, because group2 has more tokens and it would be + // handled with higher priority. + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(2)); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(1)); +} diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index 543937fa8ef..d746dfad5cb 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -30,7 +30,7 @@ fn test_basic() { let (control_drop_tx, control_drop_rx) = mpsc::unbounded(); control_fsm.sender = Some(control_drop_tx); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); @@ -130,7 +130,7 @@ fn test_basic() { fn test_router_trace() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 6d64754d042..2b4eb9ff226 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1543,7 +1543,7 @@ mod tests { } let diff = cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.min_ts_interval, ReadableDuration::millis(200)); + assert_eq!(ep.config.min_ts_interval, ReadableDuration::secs(1)); assert_eq!(ep.config.hibernate_regions_compatible, true); { diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 696bc6341ee..aac2842e404 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -273,6 +273,7 @@ mod tests { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -301,6 +302,7 @@ mod tests { prev_lead_transferee: 3, vote: 3, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index e225cbe0519..b391c1d239a 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -5,6 +5,7 @@ publish = false version = "0.0.1" [dependencies] +crossbeam-skiplist = "0.1" fail = "0.5" kvproto = { workspace = true } parking_lot = "0.12" @@ -12,12 +13,6 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } txn_types = { workspace = true } -# FIXME: switch to the crates.io version after crossbeam-skiplist is released -[dependencies.crossbeam-skiplist] -git = "https://github.com/tikv/crossbeam.git" -branch = "tikv-5.0" -package = 
"crossbeam-skiplist" - [dev-dependencies] criterion = "0.3" futures = "0.3" diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 5e6fbe87267..93218767ec0 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -92,6 +92,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_num_keys(&self) -> Result { + panic!() + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_panic/src/perf_context.rs b/components/engine_panic/src/perf_context.rs index 46d18c00e77..27bdd1ac066 100644 --- a/components/engine_panic/src/perf_context.rs +++ b/components/engine_panic/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::engine::PanicEngine; impl PerfContextExt for PanicEngine { type PerfContext = PanicPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { panic!() } } diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index c3de53b4932..c0539c1edd5 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -67,6 +67,10 @@ impl RaftEngineReadOnly for PanicEngine { panic!() } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { panic!() } @@ -167,11 +171,12 @@ impl RaftEngine for PanicEngine { } impl RaftLogBatch for PanicWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { - panic!() - } - - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { panic!() } @@ -231,6 +236,10 @@ impl RaftLogBatch for PanicWriteBatch { panic!() } + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { panic!() } diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index 296d7ce617a..f6cda5312cb 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -2,7 +2,9 @@ use std::ops::Deref; -use engine_traits::{IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + CfNamesExt, IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot, +}; use crate::{db_vector::PanicDbVector, engine::PanicEngine}; @@ -36,6 +38,12 @@ impl Iterable for PanicSnapshot { } } +impl CfNamesExt for PanicSnapshot { + fn cf_names(&self) -> Vec<&str> { + panic!() + } +} + pub struct PanicSnapshotIterator; impl Iterator for PanicSnapshotIterator { diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index b940fcb39f3..3bbf03cb77f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -261,7 +261,7 @@ mod tests { let (region_id, tablet_index) = (2, 3); let storage = Arc::new(MemStorage::default()); - let state = Arc::new(FlushState::default()); + let state = Arc::new(FlushState::new(0)); let listener = PersistenceListener::new(region_id, tablet_index, state.clone(), storage.clone()); let mut db_opt = RocksDbOptions::default(); diff --git a/components/engine_rocks/src/logger.rs 
b/components/engine_rocks/src/logger.rs index b7b196448c5..85f4de713ac 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -20,6 +20,30 @@ impl Logger for RocksdbLogger { } } +pub struct TabletLogger { + tablet_name: String, +} + +impl TabletLogger { + pub fn new(tablet_name: String) -> Self { + Self { tablet_name } + } +} + +impl Logger for TabletLogger { + fn logv(&self, log_level: InfoLogLevel, log: &str) { + match log_level { + InfoLogLevel::Header => info!(#"rocksdb_log_header", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Debug => debug!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Info => info!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Warn => warn!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Error => error!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Fatal => crit!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + _ => {} + } + } +} + #[derive(Default)] pub struct RaftDbLogger; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 55546869272..3477226ae76 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -2,7 +2,7 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, + Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, }; use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; @@ -258,7 +258,7 @@ impl MiscExt for RocksEngine { fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } @@ -332,6 +332,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_rocks/src/perf_context.rs b/components/engine_rocks/src/perf_context.rs index a731a9461dc..f8cfdbcc667 100644 --- a/components/engine_rocks/src/perf_context.rs +++ b/components/engine_rocks/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index d5331a2ce29..a0a5acd5dd8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -166,6 +166,10 @@ impl RaftEngineReadOnly for RocksEngine { panic!() } + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) } @@ -361,7 +365,19 @@ impl RaftEngine for 
RocksEngine { } impl RaftLogBatch for RocksWriteBatchVec { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -369,13 +385,6 @@ impl RaftLogBatch for RocksWriteBatchVec { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } @@ -434,6 +443,15 @@ impl RaftLogBatch for RocksWriteBatchVec { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index b19a32fd739..60a12c4ac6d 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -5,7 +5,9 @@ use std::{ sync::Arc, }; -use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ @@ -95,3 +97,9 @@ impl Peekable for RocksSnapshot { Ok(v.map(RocksDbVector::from_raw)) } } + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() + } +} diff --git a/components/engine_tirocks/src/perf_context.rs b/components/engine_tirocks/src/perf_context.rs index d1d975c65c3..643967230df 100644 --- a/components/engine_tirocks/src/perf_context.rs +++ b/components/engine_tirocks/src/perf_context.rs @@ -136,7 +136,6 @@ impl engine_traits::PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; fn get_perf_context( - &self, level: engine_traits::PerfLevel, kind: engine_traits::PerfContextKind, ) -> Self::PerfContext { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index cfed95f0426..8300348da8c 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -50,12 +50,18 @@ impl FlushProgress { /// raftstore will update state changes and corresponding apply index, when /// flush, `PersistenceListener` will query states related to the memtable /// and persist the relation to raft engine. -#[derive(Default, Debug)] +#[derive(Debug)] pub struct FlushState { applied_index: AtomicU64, } impl FlushState { + pub fn new(applied_index: u64) -> Self { + Self { + applied_index: AtomicU64::new(applied_index), + } + } + /// Set the latest applied index. 
#[inline] pub fn set_applied_index(&self, index: u64) { @@ -151,7 +157,10 @@ impl PersistenceListener { } match flushed_pr { Some(pr) => pr, - None => panic!("{} not found in {:?}", cf, prs), + None => panic!( + "[region_id={}] [tablet_index={}] {} not found in {:?}", + self.region_id, self.tablet_index, cf, prs + ), } }; self.storage diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index d9a07a1a915..5bbcbb2de79 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -115,6 +115,8 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn get_total_sst_files_size_cf(&self, cf: &str) -> Result>; + fn get_num_keys(&self) -> Result; + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index ba48974a460..44462e3fe3c 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -37,7 +37,7 @@ numeric_enum_serializing_mod! {perf_level_serde PerfLevel { pub trait PerfContextExt { type PerfContext: PerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; } /// The subsystem the PerfContext is being created for. diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 9e95ae95e14..671fed8b3cf 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -33,6 +33,7 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { ) -> Result>; /// Get the flushed index of the given CF. fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result; fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -66,7 +67,7 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { Ok(true) }) .unwrap(); - batch.append(region_id, entries).unwrap(); + batch.append(region_id, None, entries).unwrap(); if let Some(state) = self.get_raft_state(region_id).unwrap() { batch.put_raft_state(region_id, &state).unwrap(); } @@ -150,11 +151,19 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send } pub trait RaftLogBatch: Send { - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()>; - - /// Remove Raft logs in [`from`, `to`) which will be overwritten later. - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64); + /// Append continuous entries to the batch. + /// + /// All existing entries with same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. + /// Nothing will be deleted if entries is empty. Note: `RaftLocalState` + /// won't be updated in this call. + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()>; fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()>; @@ -193,6 +202,9 @@ pub trait RaftLogBatch: Send { apply_index: u64, ) -> Result<()>; + /// Mark a tablet may contain data that is not supposed to be in its range. 
+ fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()>; + /// Indicate whether region states should be recovered from raftdb and /// replay raft logs. /// When kvdb's write-ahead-log is disabled, the sequence number of the last diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 7907abd1445..a5829161e25 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -2,7 +2,7 @@ use std::fmt::Debug; -use crate::{iterable::Iterable, peekable::Peekable}; +use crate::{iterable::Iterable, peekable::Peekable, CfNamesExt}; /// A consistent read-only view of the database. /// @@ -10,6 +10,6 @@ use crate::{iterable::Iterable, peekable::Peekable}; /// clonable, call `into_sync` to create a `SyncSnapshot`. pub trait Snapshot where - Self: 'static + Peekable + Iterable + Send + Sync + Sized + Debug, + Self: 'static + Peekable + Iterable + CfNamesExt + Send + Sync + Sized + Debug, { } diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index edc0bd99870..6bdfa97a6e6 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -31,6 +31,13 @@ pub struct CachedTablet { version: u64, } +impl CachedTablet { + fn release(&mut self) { + self.cache = None; + self.version = 0; + } +} + impl CachedTablet { #[inline] fn new(data: Option) -> Self { @@ -44,13 +51,11 @@ impl CachedTablet { } } - pub fn set(&mut self, data: EK) { - self.version = { - let mut latest_data = self.latest.data.lock().unwrap(); - *latest_data = Some(data.clone()); - self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 - }; - self.cache = Some(data); + pub fn set(&mut self, data: EK) -> Option { + self.cache = Some(data.clone()); + let mut latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.fetch_add(1, Ordering::Relaxed) + 1; + latest_data.replace(data) } /// Get the tablet from cache without checking if it's up to date. @@ -69,19 +74,6 @@ impl CachedTablet { } self.cache() } - - /// Returns how many versions has passed. - #[inline] - pub fn refresh(&mut self) -> u64 { - let old_version = self.version; - if self.latest.version.load(Ordering::Relaxed) > old_version { - let latest_data = self.latest.data.lock().unwrap(); - self.version = self.latest.version.load(Ordering::Relaxed); - self.cache = latest_data.clone(); - return self.version - old_version; - } - 0 - } } /// Context to be passed to `TabletFactory`. @@ -222,10 +214,20 @@ impl TabletRegistry { }) } + /// Format the name as {prefix}_{id}_{suffix}. If prefix is empty, it will + /// be format as {id}_{suffix}. 
pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { - format!("{}{}_{}", prefix, id, suffix) + format!( + "{}{:_(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { let name = path.file_name().unwrap().to_str().unwrap(); let mut parts = name.rsplit('_'); @@ -307,8 +309,10 @@ impl TabletRegistry { let mut tablets = self.tablets.tablets.lock().unwrap(); for (id, tablet) in tablets.iter_mut() { if !f(*id, tablet) { + tablet.release(); return; } + tablet.release(); } } } @@ -463,10 +467,19 @@ mod tests { }); assert_eq!(count, 1); - let name = registry.tablet_name("prefix_", 12, 30); + let name = registry.tablet_name("prefix", 12, 30); assert_eq!(name, "prefix_12_30"); let normal_name = registry.tablet_name("", 20, 15); let normal_tablet_path = registry.tablet_path(20, 15); assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + + let full_prefix_path = registry.tablet_root().join(name); + let res = registry.parse_tablet_name(&full_prefix_path); + assert_eq!(res, Some(("prefix", 12, 30))); + let res = registry.parse_tablet_name(&normal_tablet_path); + assert_eq!(res, Some(("", 20, 15))); + let invalid_path = registry.tablet_root().join("invalid_12"); + let res = registry.parse_tablet_name(&invalid_path); + assert_eq!(res, None); } } diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 3ca2ac0b29f..782c4f3923b 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -12,5 +12,6 @@ define_error_codes!( REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), GLOBAL_CONFIG_NOT_FOUND => ("GlobalConfigNotFound","",""), + DATA_COMPACTED => ("DataCompacted","",""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 1b6a85493cf..35dfe564ef0 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -32,6 +32,7 @@ define_error_codes!( RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), + IS_WITNESS => ("IsWitness", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -70,6 +71,8 @@ impl ErrorCodeExt for errorpb::Error { FLASHBACK_IN_PROGRESS } else if self.has_flashback_not_prepared() { FLASHBACK_NOT_PREPARED + } else if self.has_is_witness() { + IS_WITNESS } else { UNKNOWN } diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index 5f2bf5935ee..b5a6412d00a 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -10,6 +10,7 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] panic_hook = { workspace = true } diff --git a/components/keys/src/rewrite.rs b/components/keys/src/rewrite.rs index 51f588e9732..68541bb50e0 100644 --- a/components/keys/src/rewrite.rs +++ b/components/keys/src/rewrite.rs @@ -6,11 +6,21 @@ use std::ops::Bound::{self, *}; +use tikv_util::codec::bytes::encode_bytes; + /// An error indicating the key cannot be rewritten because it does not start /// with the given prefix. 
#[derive(PartialEq, Debug, Clone)] pub struct WrongPrefix; +pub fn encode_bound(bound: Bound>) -> Bound> { + match bound { + Included(k) => Included(encode_bytes(&k)), + Excluded(k) => Excluded(encode_bytes(&k)), + Unbounded => Unbounded, + } +} + /// Rewrites the prefix of a byte array. pub fn rewrite_prefix( old_prefix: &[u8], diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index c25e37f23b5..f46d6111c5d 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -19,6 +19,7 @@ lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" security = { workspace = true } semver = "0.10" serde = "1.0" diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 9f466a6a351..b0c21797a91 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - collections::HashMap, fmt, sync::{ atomic::{AtomicU64, Ordering}, @@ -27,10 +26,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, debug, error, info, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, Either, HandyRwLock, + box_err, debug, error, info, thd_name, time::Instant, timer::GLOBAL_TIMER_HANDLE, warn, Either, + HandyRwLock, }; use txn_types::TimeStamp; use yatp::{task::future::TaskCell, ThreadPool}; @@ -194,9 +191,7 @@ impl RpcClient { &self, key: &[u8], ) -> PdFuture<(metapb::Region, Option)> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_region.start_coarse_timer(); let mut req = pdpb::GetRegionRequest::default(); req.set_header(self.header()); @@ -256,8 +251,8 @@ impl RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_async"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_async + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; let store = resp.take_store(); if store.get_state() != metapb::StoreState::Tombstone { @@ -286,10 +281,47 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, list: Vec) -> PdFuture> { - use kvproto::pdpb::LoadGlobalConfigRequest; - let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + fn store_global_config( + &self, + config_path: String, + items: Vec, + ) -> PdFuture<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .store_global_config + .start_coarse_timer(); + + let mut req = pdpb::StoreGlobalConfigRequest::new(); + req.set_config_path(config_path); + req.set_changes(items.into()); + let executor = move |client: &Client, req| match client + .inner + .rl() + .client_stub + .store_global_config_async(&req) + { + Ok(grpc_response) => Box::pin(async move { + if let Err(err) = grpc_response.await { + return Err(box_err!("{:?}", err)); + } + Ok(()) + }) as PdFuture<_>, + Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + }; + self.pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn load_global_config( + &self, + config_path: String, + ) -> PdFuture<(Vec, i64)> { + let _timer = 
PD_REQUEST_HISTOGRAM_VEC + .load_global_config + .start_coarse_timer(); + + let mut req = pdpb::LoadGlobalConfigRequest::new(); + req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner .rl() @@ -299,17 +331,10 @@ impl PdClient for RpcClient { { Ok(grpc_response) => Box::pin(async move { match grpc_response.await { - Ok(grpc_response) => { - let mut res = HashMap::with_capacity(grpc_response.get_items().len()); - for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } - } - Ok(res) - } + Ok(grpc_response) => Ok(( + Vec::from(grpc_response.get_items()), + grpc_response.get_revision(), + )), Err(err) => Err(box_err!("{:?}", err)), } }) as PdFuture<_>, @@ -322,9 +347,17 @@ impl PdClient for RpcClient { fn watch_global_config( &self, + config_path: String, + revision: i64, ) -> Result> { - use kvproto::pdpb::WatchGlobalConfigRequest; - let req = WatchGlobalConfigRequest::default(); + let _timer = PD_REQUEST_HISTOGRAM_VEC + .watch_global_config + .start_coarse_timer(); + + let mut req = pdpb::WatchGlobalConfigRequest::default(); + info!("[global_config] start watch global config"; "path" => &config_path, "revision" => revision); + req.set_config_path(config_path); + req.set_revision(revision); sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, _| { client.watch_global_config(&req) }) @@ -340,7 +373,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); let mut req = pdpb::BootstrapRequest::default(); @@ -357,7 +390,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); let mut req = pdpb::IsBootstrappedRequest::default(); @@ -372,9 +405,7 @@ impl PdClient for RpcClient { } fn alloc_id(&self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); let mut req = pdpb::AllocIdRequest::default(); req.set_header(self.header()); @@ -393,7 +424,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); let mut req = pdpb::IsSnapshotRecoveringRequest::default(); @@ -408,9 +439,7 @@ impl PdClient for RpcClient { } fn put_store(&self, store: metapb::Store) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); let mut req = pdpb::PutStoreRequest::default(); req.set_header(self.header()); @@ -425,9 +454,7 @@ impl PdClient for RpcClient { } fn get_store(&self, store_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_store.start_coarse_timer(); let mut req = pdpb::GetStoreRequest::default(); req.set_header(self.header()); @@ -451,9 +478,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - 
.with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); let mut req = pdpb::GetAllStoresRequest::default(); req.set_header(self.header()); @@ -469,7 +494,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); let mut req = pdpb::GetClusterConfigRequest::default(); @@ -521,8 +546,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if resp.has_region() { Ok(Some(resp.take_region())) @@ -563,8 +588,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { Ok(Some((resp.take_region(), resp.take_leader()))) @@ -700,8 +725,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -738,8 +763,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -784,8 +809,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), @@ -821,8 +846,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -834,9 +859,7 @@ impl PdClient for RpcClient { } fn scatter_region(&self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_header(self.header()); @@ -875,8 +898,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); 
check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) }) as PdFuture<_> @@ -892,9 +915,7 @@ impl PdClient for RpcClient { } fn get_operator(&self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); let mut req = pdpb::GetOperatorRequest::default(); req.set_header(self.header()); @@ -909,7 +930,7 @@ impl PdClient for RpcClient { } fn batch_get_tso(&self, count: u32) -> PdFuture { - let begin = Instant::now(); + let timer = Instant::now(); let executor = move |client: &Client, _| { // Remove Box::pin and Compat when GLOBAL_TIMER_HANDLE supports futures 0.3 let ts_fut = Compat::new(Box::pin(client.inner.rl().tso.get_timestamp(count))); @@ -928,8 +949,8 @@ impl PdClient for RpcClient { } })?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["tso"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .tso + .observe(timer.saturating_elapsed_secs()); Ok(ts) }) as PdFuture<_> }; @@ -944,7 +965,7 @@ impl PdClient for RpcClient { safe_point: TimeStamp, ttl: Duration, ) -> PdFuture<()> { - let begin = Instant::now(); + let timer = Instant::now(); let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); req.set_header(self.header()); req.set_service_id(name.into()); @@ -966,8 +987,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -1002,8 +1023,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 3d17a94a494..cfa0d46303c 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -47,12 +47,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, error, info, - mpsc::future as mpsc, - slow_log, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::Instant, + timer::GLOBAL_TIMER_HANDLE, warn, }; use tokio::sync::{broadcast, mpsc as tokio_mpsc}; use txn_types::TimeStamp; @@ -542,7 +538,7 @@ pub trait PdClient { fn fetch_cluster_id(&mut self) -> Result; - fn load_global_config(&mut self, list: Vec) -> PdFuture>; + fn load_global_config(&mut self, config_path: String) -> PdFuture>; fn watch_global_config( &mut self, @@ -791,10 +787,10 @@ impl PdClient for RpcClient { Ok((tx, resp_rx)) } - fn load_global_config(&mut self, list: Vec) -> PdFuture> { + fn load_global_config(&mut self, config_path: String) -> PdFuture> { use kvproto::pdpb::LoadGlobalConfigRequest; let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + req.set_config_path(config_path); let mut raw_client = self.raw_client.clone(); Box::pin(async move { raw_client.wait_for_ready().await?; @@ -803,11 +799,7 @@ impl PdClient for RpcClient { Ok(grpc_response) => { let mut res = 
HashMap::with_capacity(grpc_response.get_items().len()); for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); } Ok(res) } @@ -839,7 +831,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -860,7 +852,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -879,9 +871,7 @@ impl PdClient for RpcClient { } fn alloc_id(&mut self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -906,7 +896,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -925,9 +915,7 @@ impl PdClient for RpcClient { } fn put_store(&mut self, store: metapb::Store) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -966,8 +954,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_and_stats"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_and_stats + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let store = resp.take_store(); @@ -980,9 +968,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1002,7 +988,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1041,8 +1027,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let region = if resp.has_region() { @@ -1080,8 +1066,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() { @@ -1119,8 +1105,8 @@ impl PdClient for 
RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { @@ -1149,8 +1135,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1183,8 +1169,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1227,8 +1213,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { @@ -1261,8 +1247,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) @@ -1270,9 +1256,7 @@ impl PdClient for RpcClient { } fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_region_id(region.get_id()); @@ -1311,8 +1295,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_saft_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) @@ -1320,9 +1304,7 @@ impl PdClient for RpcClient { } fn get_operator(&mut self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1370,8 +1352,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) @@ -1400,8 +1382,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) 
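Note: the pd_client hunks above all follow one pattern: label strings passed to `with_label_values` become fields of a static metric struct (defined in `metrics.rs` below), and `duration_to_sec(timer.saturating_elapsed())` becomes `timer.saturating_elapsed_secs()`. Side by side, the hot-path difference is:

```rust
// Before: the label value is hashed and looked up on every observation.
let _timer = PD_REQUEST_HISTOGRAM_VEC
    .with_label_values(&["get_region"])
    .start_coarse_timer();

// After: the per-label child is resolved once when the static struct is
// built, so recording a request is a plain field access.
let _timer = PD_REQUEST_HISTOGRAM_VEC.get_region.start_coarse_timer();
```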
diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index 61adceec391..5bacca03354 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -26,6 +26,8 @@ pub enum Error { StoreTombstone(String), #[error("global config item {0} not found")] GlobalConfigNotFound(String), + #[error("required watch revision is smaller than current compact/min revision. {0:?}")] + DataCompacted(String), } pub type Result = result::Result; @@ -33,7 +35,10 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::ClusterNotBootstrapped(_) | Error::StreamDisconnect(_) => true, + Error::Grpc(_) + | Error::ClusterNotBootstrapped(_) + | Error::StreamDisconnect(_) + | Error::DataCompacted(_) => true, Error::Other(_) | Error::RegionNotFound(_) | Error::StoreTombstone(_) @@ -55,6 +60,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::pd::REGION_NOT_FOUND, Error::StoreTombstone(_) => error_code::pd::STORE_TOMBSTONE, Error::GlobalConfigNotFound(_) => error_code::pd::GLOBAL_CONFIG_NOT_FOUND, + Error::DataCompacted(_) => error_code::pd::DATA_COMPACTED, Error::Other(_) => error_code::pd::UNKNOWN, } } diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 8674130c799..b877750770d 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -14,15 +14,14 @@ mod util; mod config; pub mod errors; -use std::{cmp::Ordering, collections::HashMap, ops::Deref, sync::Arc, time::Duration}; +use std::{cmp::Ordering, ops::Deref, sync::Arc, time::Duration}; use futures::future::BoxFuture; -use grpcio::ClientSStreamReceiver; use kvproto::{ metapb, pdpb, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, }; -use pdpb::{QueryStats, WatchGlobalConfigResponse}; +use pdpb::QueryStats; use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; @@ -201,6 +200,8 @@ impl BucketStat { } pub const INVALID_ID: u64 = 0; +// TODO: Implementation of config registration for each module +pub const RESOURCE_CONTROL_CONFIG_PATH: &str = "resource_group/settings"; /// PdClient communicates with Placement Driver (PD). /// Because now one PD only supports one cluster, so it is no need to pass @@ -209,17 +210,28 @@ pub const INVALID_ID: u64 = 0; /// all the time. pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _list: Vec) -> PdFuture> { + fn load_global_config( + &self, + _config_path: String, + ) -> PdFuture<(Vec, i64)> { unimplemented!(); } /// Store a list of GlobalConfig - fn store_global_config(&self, _list: HashMap) -> PdFuture<()> { + fn store_global_config( + &self, + _config_path: String, + _items: Vec, + ) -> PdFuture<()> { unimplemented!(); } /// Watching change of GlobalConfig - fn watch_global_config(&self) -> Result> { + fn watch_global_config( + &self, + _config_path: String, + _revision: i64, + ) -> Result> { unimplemented!(); } diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index 57879a57d0e..a4ef9c5ce4e 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,14 +2,52 @@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; + +make_static_metric! 
{ + pub label_enum PDRequestEventType { + get_region, + get_region_by_id, + get_region_leader_by_id, + scatter_region, + get_store, + get_store_async, + put_store, + get_all_stores, + get_store_and_stats, + store_global_config, + load_global_config, + watch_global_config, + bootstrap_cluster, + is_cluster_bootstrapped, + get_cluster_config, + ask_split, + ask_batch_split, + report_batch_split, + get_gc_safe_point, + update_service_safe_point, + min_resolved_ts, + get_operator, + alloc_id, + is_recovering_marked, + store_heartbeat, + tso, + } + + pub struct PDRequestEventHistogramVec: Histogram { + "type" => PDRequestEventType, + } +} lazy_static! { - pub static ref PD_REQUEST_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( - "tikv_pd_request_duration_seconds", - "Bucketed histogram of PD requests duration", - &["type"] - ) - .unwrap(); + pub static ref PD_REQUEST_HISTOGRAM_VEC: PDRequestEventHistogramVec = + register_static_histogram_vec!( + PDRequestEventHistogramVec, + "tikv_pd_request_duration_seconds", + "Bucketed histogram of PD requests duration", + &["type"] + ) + .unwrap(); pub static ref PD_HEARTBEAT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_pd_heartbeat_message_total", "Total number of PD heartbeat messages.", diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 72c8cc16b04..fd58cd921d8 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -873,6 +873,7 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::GlobalConfigNotFound => { Err(Error::GlobalConfigNotFound(err.get_message().to_owned())) } + ErrorType::DataCompacted => Err(Error::DataCompacted(err.get_message().to_owned())), ErrorType::Ok => Ok(()), ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), ErrorType::Unknown => Err(box_err!(err.get_message())), diff --git a/components/profiler/Cargo.toml b/components/profiler/Cargo.toml index b0c456b209f..e5583a631d5 100644 --- a/components/profiler/Cargo.toml +++ b/components/profiler/Cargo.toml @@ -18,4 +18,5 @@ valgrind_request = { version = "1.1.0", optional = true } [[example]] name = "prime" +path = "examples/prime.rs" required-features = ["profiling"] diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 7c98adf325f..92d7a4f7353 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -366,7 +366,7 @@ impl RaftLogEngine { impl PerfContextExt for RaftLogEngine { type PerfContext = RaftEnginePerfContext; - fn get_perf_context(&self, _level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(_level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { RaftEnginePerfContext } } @@ -381,21 +381,23 @@ const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; const FLUSH_STATE_KEY: &[u8] = &[0x06]; +const DIRTY_MARK_KEY: &[u8] = &[0x07]; // All keys are of the same length. const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + _overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + // overwrite is handled within raft log engine. 
self.0 .add_entries::(raft_group_id, &entries) .map_err(transfer_error) } - fn cut_logs(&mut self, _: u64, _: u64, _: u64) { - // It's unnecessary because overlapped entries can be handled in - // `append`. - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.0 .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) @@ -470,8 +472,21 @@ impl RaftLogBatchTrait for RaftLogBatch { let key = encode_flushed_key(cf, tablet_index); let mut value = vec![0; 8]; NumberCodec::encode_u64(&mut value, apply_index); - self.0.put(raft_group_id, key.to_vec(), value); - Ok(()) + self.0 + .put(raft_group_id, key.to_vec(), value) + .map_err(transfer_error) + } + + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + if dirty { + self.0 + .put(raft_group_id, key.to_vec(), vec![]) + .map_err(transfer_error) + } else { + self.0.delete(raft_group_id, key.to_vec()); + Ok(()) + } } fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { @@ -600,6 +615,11 @@ impl RaftEngineReadOnly for RaftLogEngine { Ok(index) } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + Ok(self.0.get(raft_group_id, &key).is_some()) + } + fn get_recover_state(&self) -> Result> { self.0 .get_message(STORE_STATE_ID, RECOVER_STATE_KEY) diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 6726c5ed742..5b917b9ddf7 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -52,6 +52,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft-proto = { version = "0.7.0" } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index a3800085522..1c7360a86bc 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -26,16 +26,18 @@ use raftstore::{ store::{ fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, local_metrics::RaftMetrics, - Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, - TabletSnapManager, Transport, WriteSenders, + AutoSplitController, Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, }, }; +use resource_metering::CollectorRegHandle; use slog::{warn, Logger}; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, + log::SlogFormat, sys::SysQuota, - time::Instant as TiInstant, + time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, @@ -45,7 +47,7 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, - operation::SPLIT_PREFIX, + operation::{SharedReadTablet, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::{pd, tablet_gc}, @@ -72,7 +74,7 @@ pub struct StoreContext { pub timer: SteadyTimer, pub schedulers: Schedulers, /// store meta - pub store_meta: Arc>, + pub store_meta: Arc>>, pub engine: ER, pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, @@ -121,6 +123,7 @@ struct StorePoller { /// Buffers 
to hold in-coming messages. store_msg_buf: Vec, peer_msg_buf: Vec, + timer: tikv_util::time::Instant, /// These fields controls the timing of flushing messages generated by /// FSMs. last_flush_time: TiInstant, @@ -134,6 +137,7 @@ impl StorePoller { cfg_tracker, store_msg_buf: Vec::new(), peer_msg_buf: Vec::new(), + timer: tikv_util::time::Instant::now(), last_flush_time: TiInstant::now(), need_flush_events: false, } @@ -153,6 +157,7 @@ impl StorePoller { fn flush_events(&mut self) { self.schedule_ticks(); + self.poll_ctx.raft_metrics.maybe_flush(); } fn schedule_ticks(&mut self) { @@ -183,6 +188,9 @@ impl PollHandler Option { @@ -232,7 +240,13 @@ impl PollHandler>>]) {} + fn end(&mut self, _batch: &mut [Option>>]) { + let dur = self.timer.saturating_elapsed(); + self.poll_ctx + .raft_metrics + .process_ready + .observe(duration_to_sec(dur)); + } fn pause(&mut self) { if self.poll_ctx.trans.need_flush() { @@ -258,7 +272,7 @@ struct StorePollerBuilder { schedulers: Schedulers, apply_pool: FuturePool, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, } @@ -272,7 +286,7 @@ impl StorePollerBuilder { router: StoreRouter, schedulers: Schedulers, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, ) -> Self { @@ -338,9 +352,9 @@ impl StorePollerBuilder { let prev = regions.insert(region_id, (sender, peer_fsm)); if let Some((_, p)) = prev { return Err(box_err!( - "duplicate region {:?} vs {:?}", - p.logger().list(), - regions[®ion_id].1.logger().list() + "duplicate region {} vs {}", + SlogFormat(p.logger()), + SlogFormat(regions[®ion_id].1.logger()) )); } Ok(()) @@ -428,13 +442,22 @@ pub struct Schedulers { pub split_check: Scheduler, } +impl Schedulers { + fn stop(&self) { + self.read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + self.split_check.stop(); + } +} + /// A set of background threads that will processing offloaded work from /// raftstore. struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, pd: LazyWorker, - tablet_gc_worker: Worker, + tablet_gc: Worker, async_write: StoreWriters, purge: Option, @@ -447,18 +470,29 @@ impl Workers { Self { async_read: Worker::new("async-read-worker"), pd, - tablet_gc_worker: Worker::new("tablet-gc-worker"), + tablet_gc: Worker::new("tablet-gc-worker"), async_write: StoreWriters::default(), purge, background, } } + + fn stop(mut self) { + self.async_write.shutdown(); + self.async_read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + if let Some(w) = self.purge { + w.stop(); + } + } } /// The system used for polling Raft activities. 
pub struct StoreSystem { system: BatchSystem, StoreFsm>, workers: Option>, + schedulers: Option>, logger: Logger, shutdown: Arc, } @@ -473,11 +507,13 @@ impl StoreSystem { trans: T, pd_client: Arc, router: &StoreRouter, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, ) -> Result<()> @@ -493,7 +529,9 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let purge_worker = if raft_engine.need_manual_purge() { + let purge_worker = if raft_engine.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = raft_engine.clone(); let logger = self.logger.clone(); @@ -529,13 +567,19 @@ impl StoreSystem { pd_client, raft_engine.clone(), tablet_registry.clone(), + snap_mgr.clone(), router.clone(), workers.pd.remote(), concurrency_manager, causal_ts_provider, + workers.pd.scheduler(), + auto_split_controller, + store_meta.lock().unwrap().region_read_progress.clone(), + collector_reg_handle, self.logger.clone(), self.shutdown.clone(), - )); + cfg.clone(), + )?); let split_check_scheduler = workers.background.start( "split-check", @@ -546,7 +590,7 @@ impl StoreSystem { ), ); - let tablet_gc_scheduler = workers.tablet_gc_worker.start( + let tablet_gc_scheduler = workers.tablet_gc.start_with_timer( "tablet-gc-worker", tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), ); @@ -566,13 +610,14 @@ impl StoreSystem { tablet_registry, trans, router.clone(), - schedulers, + schedulers.clone(), self.logger.clone(), store_meta.clone(), snap_mgr, coprocessor_host, ); self.workers = Some(workers); + self.schedulers = Some(schedulers); let peers = builder.init()?; // Choose a different name so we know what version is actually used. rs stands // for raft store. @@ -583,9 +628,14 @@ impl StoreSystem { let mut address = Vec::with_capacity(peers.len()); { let mut meta = store_meta.as_ref().lock().unwrap(); - for (region_id, (tx, fsm)) in peers { - meta.readers - .insert(region_id, fsm.peer().generate_read_delegate()); + for (region_id, (tx, mut fsm)) in peers { + if let Some(tablet) = fsm.peer_mut().tablet() { + let read_tablet = SharedReadTablet::new(tablet.clone()); + meta.readers.insert( + region_id, + (fsm.peer().generate_read_delegate(), read_tablet), + ); + } address.push(region_id); mailboxes.push(( @@ -610,18 +660,16 @@ impl StoreSystem { if self.workers.is_none() { return; } - let mut workers = self.workers.take().unwrap(); + let workers = self.workers.take().unwrap(); - // TODO: gracefully shutdown future pool + // TODO: gracefully shutdown future apply pool + // Stop schedulers first, so all background future worker pool will be stopped + // gracefully. 
+ self.schedulers.take().unwrap().stop(); self.system.shutdown(); - workers.async_write.shutdown(); - workers.async_read.stop(); - workers.pd.stop(); - if let Some(w) = workers.purge { - w.stop(); - } + workers.stop(); } } @@ -701,10 +749,11 @@ where { let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); let (router, system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm, None); let system = StoreSystem { system, workers: None, + schedulers: None, logger: logger.clone(), shutdown: Arc::new(AtomicBool::new(false)), }; diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index c0eabd2120e..1544a703c6d 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -10,7 +10,7 @@ use crossbeam::channel::TryRecvError; use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; -use raftstore::store::ReadTask; +use raftstore::store::{Config, ReadTask}; use slog::Logger; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, @@ -58,6 +58,7 @@ pub struct ApplyFsm { impl ApplyFsm { pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -65,10 +66,12 @@ impl ApplyFsm { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); let apply = Apply::new( + cfg, peer, region_state, res_reporter, @@ -76,6 +79,7 @@ impl ApplyFsm { read_scheduler, flush_state, log_recovery, + applied_term, logger, ); ( @@ -98,6 +102,7 @@ impl ApplyFsm { res = self.receiver.next().fuse() => res, _ = timeout.fuse() => None, }; + self.apply.on_start_apply(); let mut task = match res { Some(r) => r, None => { @@ -114,9 +119,10 @@ impl ApplyFsm { ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), + ApplyTask::ManualFlush => self.apply.on_manual_flush().await, } - // TODO: yield after some time. + self.apply.maybe_flush().await; // Perhaps spin sometime? match self.receiver.try_recv() { @@ -125,7 +131,8 @@ impl ApplyFsm { Err(TryRecvError::Disconnected) => return, } } - self.apply.flush(); + let written_bytes = self.apply.flush(); + self.apply.maybe_reschedule(written_bytes).await; } } } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 49f1efcb760..26d5c2a1458 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,7 +7,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use raftstore::store::{Config, LocksStatus, TabletSnapManager, Transport}; +use raftstore::store::{Config, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -32,7 +32,6 @@ pub struct PeerFsm { /// twice accidentally. 
tick_registry: [bool; PeerTick::VARIANT_COUNT], is_stopped: bool, - reactivate_memory_lock_ticks: usize, } impl PeerFsm { @@ -43,7 +42,11 @@ impl PeerFsm { storage: Storage, ) -> Result> { let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; - info!(peer.logger, "create peer"); + info!(peer.logger, "create peer"; + "raft_state" => ?peer.storage().raft_state(), + "apply_state" => ?peer.storage().apply_state(), + "region_state" => ?peer.storage().region_state() + ); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { peer, @@ -51,7 +54,6 @@ impl PeerFsm { receiver: rx, tick_registry: [false; PeerTick::VARIANT_COUNT], is_stopped: false, - reactivate_memory_lock_ticks: 0, }); Ok((tx, fsm)) } @@ -132,9 +134,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn schedule_pending_ticks(&mut self) { let pending_ticks = self.fsm.peer.take_pending_ticks(); for tick in pending_ticks { - if tick == PeerTick::ReactivateMemoryLock { - self.fsm.reactivate_memory_lock_ticks = 0; - } self.schedule_tick(tick); } } @@ -187,20 +186,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_start(&mut self) { - self.schedule_tick(PeerTick::Raft); + if !self.fsm.peer.maybe_pause_for_recovery(self.store_ctx) { + self.schedule_tick(PeerTick::Raft); + } self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } - // Unlike v1, it's a must to set ready when there are pending entries. Otherwise - // it may block for ever when there is unapplied conf change. - let entry_storage = self.fsm.peer.storage().entry_storage(); - if entry_storage.commit_index() > entry_storage.applied_index() - // Speed up setup if there is only one peer. - || self.fsm.peer.is_leader() - { + // Speed up setup if there is only one peer. 
+ if self.fsm.peer.is_leader() { self.fsm.peer.set_has_ready(); } } @@ -224,9 +220,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), - PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), + PeerTick::ReactivateMemoryLock => { + self.fsm.peer.on_reactivate_memory_lock_tick(self.store_ctx) + } PeerTick::ReportBuckets => unimplemented!(), - PeerTick::CheckLongUncommitted => unimplemented!(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), } } @@ -306,7 +304,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::UpdateRegionSize { size } => { + self.fsm.peer_mut().on_update_region_size(size) + } + PeerMsg::UpdateRegionKeys { keys } => { + self.fsm.peer_mut().on_update_region_keys(keys) + } + PeerMsg::ClearRegionSize => self.fsm.peer_mut().on_clear_region_size(), PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), + PeerMsg::TabletTrimmed { tablet_index } => { + self.fsm.peer_mut().on_tablet_trimmed(tablet_index) + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } @@ -315,32 +323,4 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer.propose_pending_writes(self.store_ctx); self.schedule_pending_ticks(); } - - pub fn on_reactivate_memory_lock_tick(&mut self) { - let mut pessimistic_locks = self.fsm.peer.txn_ext().pessimistic_locks.write(); - - // If it is not leader, we needn't reactivate by tick. In-memory pessimistic - // lock will be enabled when this region becomes leader again. - // And this tick is currently only used for the leader transfer failure case. - if !self.fsm.peer().is_leader() - || pessimistic_locks.status != LocksStatus::TransferringLeader - { - return; - } - - self.fsm.reactivate_memory_lock_ticks += 1; - let transferring_leader = self.fsm.peer.raft_group().raft.lead_transferee.is_some(); - // `lead_transferee` is not set immediately after the lock status changes. So, - // we need the tick count condition to avoid reactivating too early. 
- if !transferring_leader - && self.fsm.reactivate_memory_lock_ticks - >= self.store_ctx.cfg.reactive_memory_lock_timeout_tick - { - pessimistic_locks.status = LocksStatus::Normal; - self.fsm.reactivate_memory_lock_ticks = 0; - } else { - drop(pessimistic_locks); - self.schedule_tick(PeerTick::ReactivateMemoryLock); - } - } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index cb7aa99b179..17c0a9a50f9 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -12,25 +12,26 @@ use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; use keys::{data_end_key, data_key}; use kvproto::metapb::Region; -use raftstore::store::{ - fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, -}; +use raftstore::store::{fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, is_zero_duration, + log::SlogFormat, mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, }; use crate::{ batch::StoreContext, + operation::ReadDelegatePair, router::{StoreMsg, StoreTick}, }; -pub struct StoreMeta { +pub struct StoreMeta { pub store_id: u64, /// region_id -> reader - pub readers: HashMap, + pub readers: HashMap>, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, /// (region_end_key, epoch.version) -> region_id @@ -42,9 +43,9 @@ pub struct StoreMeta { pub(crate) regions: HashMap, } -impl StoreMeta { - pub fn new(store_id: u64) -> StoreMeta { - StoreMeta { +impl StoreMeta { + pub fn new(store_id: u64) -> Self { + Self { store_id, readers: HashMap::default(), region_read_progress: RegionReadProgressRegistry::default(), @@ -61,12 +62,12 @@ impl StoreMeta { .insert(region_id, (region.clone(), initialized)); // `prev` only makes sense when it's initialized. 
if let Some((prev, prev_init)) = prev && prev_init { - assert!(initialized, "{:?} region corrupted", logger.list()); + assert!(initialized, "{} region corrupted", SlogFormat(logger)); if prev.get_region_epoch().get_version() != version { let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); - assert_eq!(prev_id, Some(region_id), "{:?} region corrupted", logger.list()); + assert_eq!(prev_id, Some(region_id), "{} region corrupted", SlogFormat(logger)); } else { - assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{:?} region corrupted", logger.list()); + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{} region corrupted", SlogFormat(logger)); return; } } @@ -75,14 +76,28 @@ impl StoreMeta { self.region_ranges .insert((data_end_key(region.get_end_key()), version), region_id) .is_none(), - "{:?} region corrupted", - logger.list() + "{} region corrupted", + SlogFormat(logger) ); } } + + pub fn remove_region(&mut self, region_id: u64) { + let prev = self.regions.remove(®ion_id); + if let Some((prev, initialized)) = prev { + if initialized { + let key = ( + data_end_key(prev.get_end_key()), + prev.get_region_epoch().get_version(), + ); + let prev_id = self.region_ranges.remove(&key); + assert_eq!(prev_id, Some(prev.get_id())); + } + } + } } -impl StoreRegionMeta for StoreMeta { +impl StoreRegionMeta for StoreMeta { #[inline] fn store_id(&self) -> u64 { self.store_id @@ -203,7 +218,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { fn on_start(&mut self) { if self.fsm.store.start_time.is_some() { - panic!("{:?} unable to start again", self.fsm.store.logger.list(),); + slog_panic!(self.fsm.store.logger, "store is already started"); } self.fsm.store.start_time = Some( @@ -251,6 +266,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .fsm .store .on_store_unreachable(self.store_ctx, to_store_id), + #[cfg(feature = "testexport")] + StoreMsg::WaitFlush { region_id, ch } => { + self.fsm.store.on_wait_flush(self.store_ctx, region_id, ch) + } } } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 7ddb1687d91..b82b6de3931 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -41,4 +41,4 @@ pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; -pub use worker::pd::{FlowReporter, Task as PdTask}; +pub use worker::pd::{PdReporter, Task as PdTask}; diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index d1d10d366bf..0f5fd9b392f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -17,11 +17,13 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; use protobuf::Message; use raftstore::{ - store::{fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask}, + store::{ + fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + }, Result, }; use slog::{debug, error, info}; -use tikv_util::{box_err, Either}; +use tikv_util::{box_err, log::SlogFormat}; use crate::{ 
batch::StoreContext, @@ -32,6 +34,52 @@ use crate::{ worker::tablet_gc, }; +#[derive(Debug)] +pub struct CompactLogContext { + skipped_ticks: usize, + approximate_log_size: u64, + last_applying_index: u64, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// worker to destroy them. + tombstone_tablets_wait_index: Vec, +} + +impl CompactLogContext { + pub fn new(last_applying_index: u64) -> CompactLogContext { + CompactLogContext { + skipped_ticks: 0, + approximate_log_size: 0, + last_applying_index, + tombstone_tablets_wait_index: vec![], + } + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skipped_ticks < max_skip_ticks { + self.skipped_ticks += 1; + true + } else { + false + } + } + + pub fn add_log_size(&mut self, size: u64) { + self.approximate_log_size += size; + } + + pub fn set_last_applying_index(&mut self, index: u64) { + self.last_applying_index = index; + } + + #[inline] + pub fn last_applying_index(&self) -> u64 { + self.last_applying_index + } +} + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { pub fn on_compact_log_tick(&mut self, force: bool) { if !self.fsm.peer().is_leader() { @@ -130,13 +178,16 @@ impl Peer { replicated_idx } else if applied_idx > first_idx && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() - || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 + || self.compact_log_context().approximate_log_size + >= store_ctx.cfg.raft_log_gc_size_limit().0 { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) } else if replicated_idx < first_idx || last_idx - first_idx < 3 || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold - && self.maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + && self + .compact_log_context_mut() + .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) { return; } else { @@ -163,7 +214,7 @@ impl Peer { let (ch, _) = CmdResChannel::pair(); self.on_admin_command(store_ctx, req, ch); - self.reset_skip_compact_log_ticks(); + self.compact_log_context_mut().skipped_ticks = 0; } } @@ -217,6 +268,81 @@ impl Apply { } impl Peer { + #[inline] + pub fn record_tombstone_tablet( + &mut self, + ctx: &StoreContext, + old_tablet: EK, + new_tablet_index: u64, + ) { + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + + /// Returns if there's any tombstone being removed. 
+ #[inline] + fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { + let compact_log_context = self.compact_log_context_mut(); + let removed = compact_log_context + .tombstone_tablets_wait_index + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + compact_log_context + .tombstone_tablets_wait_index + .drain(..removed); + true + } else { + false + } + } + + pub fn has_pending_tombstone_tablets(&self) -> bool { + !self + .compact_log_context() + .tombstone_tablets_wait_index + .is_empty() + } + + #[inline] + pub fn record_tombstone_tablet_for_destroy( + &mut self, + ctx: &StoreContext, + task: &mut WriteTask, + ) { + assert!( + !self.has_pending_tombstone_tablets(), + "{} all tombstone should be cleared before being destroyed.", + SlogFormat(&self.logger) + ); + let tablet = match self.tablet() { + Some(tablet) => tablet.clone(), + None => return, + }; + let region_id = self.region_id(); + let applied_index = self.entry_storage().applied_index(); + let sched = ctx.schedulers.tablet_gc.clone(); + let _ = sched.schedule(tablet_gc::Task::prepare_destroy( + tablet, + self.region_id(), + applied_index, + )); + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, applied_index)); + })); + } + pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, @@ -255,9 +381,29 @@ impl Peer { .unwrap(); self.set_has_extra_write(); - self.maybe_compact_log_from_engine(store_ctx, Either::Right(old_truncated)); + // All logs < perssited_apply will be deleted, so should check with +1. + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() + && let Some(index) = self.compact_log_index() { + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + // Extra write set right above. + } + + let context = self.compact_log_context_mut(); + let applied = context.last_applying_index; + let total_cnt = applied - old_truncated; + let remain_cnt = applied - res.compact_index; + context.approximate_log_size = + (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; } + /// Called when apply index is persisted. #[inline] pub fn on_advance_persisted_apply_index( &mut self, @@ -269,51 +415,56 @@ impl Peer { if old_persisted < new_persisted { let region_id = self.region_id(); // TODO: batch it. + // TODO: avoid allocation if there is nothing to delete. if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( region_id, new_persisted, - self.state_changes_mut(), + task.extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)), ) { error!(self.logger, "failed to delete raft states"; "err" => ?e); - } else { - self.set_has_extra_write(); } - self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); - if self.remove_tombstone_tablets_before(new_persisted) { + // If it's snapshot, logs are gc already. + if !task.has_snapshot + && old_persisted < self.entry_storage().truncated_index() + 1 + && let Some(index) = self.compact_log_index() { + let batch = task.extra_write.ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)); + // Raft Engine doesn't care about first index. 
+ if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, batch) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + } + if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); - task.persisted_cbs.push(Box::new(move || { + if !task.has_snapshot { + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + })); + } else { + // In snapshot, the index is persisted, tablet can be destroyed directly. let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); - })) + } } } } - pub fn maybe_compact_log_from_engine( - &mut self, - store_ctx: &mut StoreContext, - old_index: Either, - ) { - let truncated = self.entry_storage().truncated_index(); - let persisted = self.storage().apply_trace().persisted_apply_index(); - match old_index { - Either::Left(old_persisted) if old_persisted >= truncated => return, - Either::Right(old_truncated) if old_truncated >= persisted => return, - _ => {} - } - let compact_index = std::cmp::min(truncated, persisted); - // Raft Engine doesn't care about first index. - if let Err(e) = - store_ctx - .engine - .gc(self.region_id(), 0, compact_index, self.state_changes_mut()) - { - error!(self.logger, "failed to compact raft logs"; "err" => ?e); - } else { - self.set_has_extra_write(); - let applied = self.storage().apply_state().get_applied_index(); - let total_cnt = applied - self.storage().entry_storage().first_index() + 1; - let remain_cnt = applied - compact_index; - self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); + fn compact_log_index(&mut self) -> Option { + let truncated = self.entry_storage().truncated_index() + 1; + let persisted_applied = self.storage().apply_trace().persisted_apply_index(); + let compact_index = std::cmp::min(truncated, persisted_applied); + if compact_index == RAFT_INIT_LOG_INDEX + 1 { + // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. + return None; } + // TODO: make this debug when stable. + info!(self.logger, "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state()); + Some(compact_index) } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 5a6c91d3567..42c433584fe 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -9,7 +9,7 @@ use std::time::Instant; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -27,7 +27,7 @@ use raftstore::{ Error, Result, }; use slog::{error, info, warn}; -use tikv_util::box_err; +use tikv_util::{box_err, slog_panic}; use super::AdminCmdResult; use crate::{ @@ -146,7 +146,7 @@ impl Peer { let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; self.storage_mut() - .set_region_state(conf_change.region_state); + .set_region_state(conf_change.region_state.clone()); if self.is_leader() { info!( self.logger, @@ -189,7 +189,14 @@ impl Peer { self.raft_group().raft.state, ); if remove_self { + // When self is destroyed, all metas will be cleaned in `start_destroy`. 
self.mark_for_destroy(None); + } else { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, conf_change.index, &conf_change.region_state) + .unwrap(); + self.set_has_extra_write(); } } } @@ -232,7 +239,7 @@ impl Apply { ) -> Result<(AdminResponse, AdminCmdResult)> { let region = self.region_state().get_region(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); - info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch()); + info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch(), "index" => index); let mut new_region = region.clone(); match change_kind { ConfChangeKind::LeaveJoint => self.apply_leave_joint(&mut new_region), @@ -254,6 +261,7 @@ impl Apply { "changes" => ?changes, "legacy" => legacy, "original region" => ?region, "err" => ?e); + return Err(e); } } let conf_ver = region.get_region_epoch().get_conf_ver() + changes.len() as u64; @@ -312,10 +320,10 @@ impl Apply { change_num += 1; } if change_num == 0 { - panic!( - "{:?} can't leave a non-joint config, region: {:?}", - self.logger.list(), - self.region_state() + slog_panic!( + self.logger, + "can't leave a non-joint config"; + "region" => ?self.region_state() ); } let conf_ver = region.get_region_epoch().get_conf_ver() + change_num; @@ -433,11 +441,11 @@ impl Apply { if let Some(exist_peer) = tikv_util::store::find_peer(region, store_id) { let r = exist_peer.get_role(); if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { - panic!( - "{:?} can't apply confchange because configuration is still in joint state, confchange: {:?}, region: {:?}", - self.logger.list(), - cp, - self.region_state() + slog_panic!( + self.logger, + "can't apply confchange because configuration is still in joint state"; + "confchange" => ?cp, + "region_state" => ?self.region_state() ); } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 977e26e0675..52bc5329dd4 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -5,6 +5,7 @@ mod conf_change; mod split; mod transfer_leader; +pub use compact_log::CompactLogContext; use compact_log::CompactLogResult; use conf_change::ConfChangeResult; use engine_traits::{KvEngine, RaftEngine}; @@ -14,7 +15,7 @@ use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; @@ -42,7 +43,10 @@ impl Peer { return; } if !req.has_admin_request() { - let e = box_err!("{:?} expect only execute admin command", self.logger.list()); + let e = box_err!( + "{} expect only execute admin command", + SlogFormat(&self.logger) + ); let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -66,8 +70,8 @@ impl Peer { // checker. 
if !self.applied_to_current_term() { let e = box_err!( - "{:?} peer has not applied to current term, applied_term {}, current_term {}", - self.logger.list(), + "{} peer has not applied to current term, applied_term {}, current_term {}", + SlogFormat(&self.logger), self.storage().entry_storage().applied_term(), self.term() ); @@ -110,9 +114,13 @@ impl Peer { } }; match &res { - Ok(index) => self - .proposal_control_mut() - .record_proposed_admin(cmd_type, *index), + Ok(index) => { + self.proposal_control_mut() + .record_proposed_admin(cmd_type, *index); + if self.proposal_control_mut().has_uncommitted_admin() { + self.raft_group_mut().skip_bcast_commit(false); + } + } Err(e) => { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index e1f4ae552f6..f9e44286490 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,7 +25,7 @@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::{borrow::Cow, cmp, path::PathBuf}; +use std::{any::Any, borrow::Cow, cmp, path::PathBuf}; use collections::HashSet; use crossbeam::channel::SendError; @@ -54,18 +54,19 @@ use raftstore::{ Result, }; use slog::info; +use tikv_util::{log::SlogFormat, slog_panic}; use crate::{ batch::StoreContext, fsm::{ApplyResReporter, PeerFsmDelegate}, - operation::AdminCmdResult, + operation::{AdminCmdResult, SharedReadTablet}, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, worker::tablet_gc, Error, }; -pub const SPLIT_PREFIX: &str = "split_"; +pub const SPLIT_PREFIX: &str = "split"; #[derive(Debug)] pub struct SplitResult { @@ -73,6 +74,10 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // Hack: in common case we should use generic, but split is an infrequent + // event that performance is not critical. And using `Any` can avoid polluting + // all existing code. 
+ tablet: Box, } #[derive(Debug)] @@ -86,6 +91,8 @@ pub struct SplitInit { /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, + approximate_size: Option, + approximate_keys: Option, } impl SplitInit { @@ -118,6 +125,20 @@ pub struct SplitFlowControl { size_diff_hint: i64, skip_split_count: u64, may_skip_split_check: bool, + approximate_size: Option, + approximate_keys: Option, +} + +impl SplitFlowControl { + #[inline] + pub fn approximate_size(&self) -> Option { + self.approximate_size + } + + #[inline] + pub fn approximate_keys(&self) -> Option { + self.approximate_keys + } } pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { @@ -168,9 +189,31 @@ impl Peer { false } + pub fn on_update_region_size(&mut self, size: u64) { + self.split_flow_control_mut().approximate_size = Some(size); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_update_region_keys(&mut self, keys: u64) { + self.split_flow_control_mut().approximate_keys = Some(keys); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_clear_region_size(&mut self) { + let control = self.split_flow_control_mut(); + control.approximate_size.take(); + control.approximate_keys.take(); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { let control = self.split_flow_control_mut(); control.size_diff_hint += metrics.size_diff_hint; + if self.is_leader() { + self.add_pending_tick(PeerTick::SplitRegionCheck); + } } pub fn on_request_split( @@ -265,6 +308,7 @@ impl Apply { self.logger, "split region"; "region" => ?region, + "index" => log_index, "boundaries" => %KeysInfoFormatter(boundaries.iter()), ); @@ -322,10 +366,10 @@ impl Apply { // We will freeze the memtable rather than flush it in the following PR. let tablet = self.tablet().clone(); let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint object: {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e ) }); @@ -340,11 +384,11 @@ impl Apply { checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - split_temp_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %split_temp_path.display(), + "error" => ?e ) }); } @@ -358,16 +402,14 @@ impl Apply { checkpointer .create_at(&derived_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - derived_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %derived_path.display(), + "error" => ?e ) }); } - // Remove the old write batch. - self.write_batch.take(); let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); @@ -376,7 +418,7 @@ impl Apply { // TODO: Should we avoid flushing for the old tablet? 
ctx.flush_state = Some(self.flush_state().clone()); let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); - self.publish_tablet(tablet); + self.set_tablet(tablet.clone()); self.region_state_mut() .set_region(regions[derived_index].clone()); @@ -392,6 +434,7 @@ impl Apply { regions, derived_index, tablet_index: log_index, + tablet: Box::new(tablet), }), )) } @@ -406,27 +449,19 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; - let derived_epoch = derived.get_region_epoch().clone(); let region_id = derived.get_id(); - // Group in-memory pessimistic locks in the original region into new regions. - // The locks of new regions will be put into the corresponding new regions - // later. And the locks belonging to the old region will stay in the original - // map. - let region_locks = { - let mut pessimistic_locks = self.txn_ext().pessimistic_locks.write(); - info!(self.logger, "moving {} locks to new regions", pessimistic_locks.len();); - // Update the version so the concurrent reader will fail due to EpochNotMatch - // instead of PessimisticLockNotFound. - pessimistic_locks.version = derived_epoch.get_version(); - pessimistic_locks.group_by_regions(&res.regions, derived) - }; + let region_locks = self.txn_context().split(&res.regions, derived); fail_point!("on_split_invalidate_locks"); + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; { let mut meta = store_ctx.store_meta.lock().unwrap(); meta.set_region(derived, true, &self.logger); - let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); + let (reader, read_tablet) = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, @@ -434,8 +469,19 @@ impl Peer { RegionChangeReason::Split, res.tablet_index, ); + + // Tablet should be updated in lock to match the epoch. + *read_tablet = SharedReadTablet::new(tablet.clone()); + } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); } + let new_region_count = res.regions.len() as u64; + let control = self.split_flow_control_mut(); + let estimated_size = control.approximate_size.map(|v| v / new_region_count); + let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + self.post_split(); if self.is_leader() { @@ -449,16 +495,25 @@ impl Peer { // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. self.report_batch_split_pd(store_ctx, res.regions.to_vec()); + // After split, the peer may need to update its metrics. 
+ let control = self.split_flow_control_mut(); + control.may_skip_split_check = false; + control.approximate_size = estimated_size; + control.approximate_keys = estimated_keys; self.add_pending_tick(PeerTick::SplitRegionCheck); } - - self.record_tablet_as_tombstone_and_refresh(res.tablet_index, store_ctx); + self.storage_mut().set_has_dirty_data(true); + let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + let tablet_index = res.tablet_index; let _ = store_ctx .schedulers .tablet_gc .schedule(tablet_gc::Task::trim( self.tablet().unwrap().clone(), derived, + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, )); let last_region_id = res.regions.last().unwrap().get_id(); @@ -476,6 +531,8 @@ impl Peer { source_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, + approximate_size: estimated_size, + approximate_keys: estimated_keys, locks, })); @@ -488,10 +545,10 @@ impl Peer { .router .force_send_control(StoreMsg::SplitInit(msg)) .unwrap_or_else(|e| { - panic!( - "{:?} fails to send split peer intialization msg to store : {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to send split peer intialization msg to store"; + "error" => ?e, ) }); } @@ -503,6 +560,9 @@ impl Peer { self.state_changes_mut() .put_region_state(region_id, res.tablet_index, ®ion_state) .unwrap(); + self.state_changes_mut() + .put_dirty_mark(region_id, res.tablet_index, true) + .unwrap(); self.set_has_extra_write(); } @@ -539,11 +599,11 @@ impl Peer { let res = self.raft_group_mut().step(msg); let accept_snap = self.raft_group().snap().is_some(); if res.is_err() || !accept_snap { - panic!( - "{:?} failed to accept snapshot {:?} with error {}", - self.logger.list(), - res, - accept_snap + slog_panic!( + self.logger, + "failed to accept snapshot"; + "accept_snapshot" => accept_snap, + "res" => ?res, ); } let prev = self.storage_mut().split_init_mut().replace(split_init); @@ -556,13 +616,21 @@ impl Peer { store_ctx: &mut StoreContext, split_init: Box, ) { - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - )); + let region_id = self.region_id(); + if self.storage().has_dirty_data() { + let tablet_index = self.storage().tablet_index(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } if split_init.source_leader && self.leader_id() == INVALID_ID && self.term() == RAFT_INIT_LOG_TERM @@ -570,12 +638,14 @@ impl Peer { let _ = self.raft_group_mut().campaign(); self.set_has_ready(); - *self.txn_ext().pessimistic_locks.write() = split_init.locks; + self.txn_context().init_with_lock(split_init.locks); + let control = self.split_flow_control_mut(); + control.approximate_size = split_init.approximate_size; + control.approximate_keys = split_init.approximate_keys; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
self.region_heartbeat_pd(store_ctx); } - let region_id = self.region_id(); if split_init.check_split { self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -593,7 +663,7 @@ impl Peer { break; } } - assert!(found, "{:?} {}", self.logger.list(), region_id); + assert!(found, "{} {}", SlogFormat(&self.logger), region_id); let split_trace = self.split_trace_mut(); let mut off = 0; let mut admin_flushed = 0; @@ -615,6 +685,19 @@ impl Peer { self.set_has_extra_write(); } } + + pub fn on_tablet_trimmed(&mut self, tablet_index: u64) { + info!(self.logger, "tablet is trimmed"; "tablet_index" => tablet_index); + let region_id = self.region_id(); + let changes = self.state_changes_mut(); + changes + .put_dirty_mark(region_id, tablet_index, false) + .unwrap(); + self.set_has_extra_write(); + if self.storage().tablet_index() == tablet_index { + self.storage_mut().set_has_dirty_data(false); + } + } } #[cfg(test)] @@ -626,17 +709,17 @@ mod test { use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactory, + kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ - Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, }; use kvproto::{ metapb::RegionEpoch, raft_cmdpb::{BatchSplitRequest, SplitRequest}, raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::cmd_resp::new_error; + use raftstore::store::{cmd_resp::new_error, Config}; use slog::o; use tempfile::TempDir; use tikv_util::{ @@ -673,7 +756,7 @@ mod test { } fn assert_split( - apply: &mut Apply, + apply: &mut Apply, parent_id: u64, right_derived: bool, new_region_ids: Vec, @@ -777,6 +860,7 @@ mod test { let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let mut apply = Apply::new( + &Config::default(), region .get_peers() .iter() @@ -787,8 +871,9 @@ mod test { reporter, reg, read_scheduler, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger.clone(), ); @@ -803,7 +888,7 @@ mod test { splits.mut_requests().clear(); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 6).unwrap_err(); // Empty requests should be rejected. assert!(err.to_string().contains("missing split requests")); @@ -824,7 +909,7 @@ mod test { .mut_requests() .push(new_split_req(b"", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 7).unwrap_err(); // Empty key will not in any region exclusively. assert!(err.to_string().contains("missing split key"), "{:?}", err); @@ -836,7 +921,7 @@ mod test { .mut_requests() .push(new_split_req(b"k1", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 8).unwrap_err(); // keys should be in ascend order. assert!( err.to_string().contains("invalid split request"), @@ -852,7 +937,7 @@ mod test { .mut_requests() .push(new_split_req(b"k2", 1, vec![11, 12])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 9).unwrap_err(); // All requests should be checked. assert!(err.to_string().contains("id count"), "{:?}", err); @@ -963,6 +1048,7 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. 
apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); + apply.apply_flow_control_mut().set_need_flush(true); assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 12bd7bbf491..54aa9845e17 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -3,22 +3,19 @@ use std::cmp::Ordering; use bytes::Bytes; -use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use fail::fail_point; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ disk_usage::DiskUsage, metapb, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, RaftRequestHeader, - TransferLeaderRequest, + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, TransferLeaderRequest, }, }; -use parking_lot::RwLockWriteGuard; use raft::{eraftpb, ProgressState, Storage}; use raftstore::{ store::{ fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, - LocksStatus, TRANSFER_LEADER_COMMAND_REPLY_CTX, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }, Result, }; @@ -30,9 +27,8 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, fsm::ApplyResReporter, - operation::command::write::SimpleWriteEncoder, raft::{Apply, Peer}, - router::{CmdResChannel, PeerMsg, PeerTick}, + router::{CmdResChannel, PeerMsg}, }; fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { @@ -296,91 +292,6 @@ impl Peer { } None } - - // Returns whether we should propose another TransferLeader command. This is - // for: - // - Considering the amount of pessimistic locks can be big, it can reduce - // unavailable time caused by waiting for the transferee catching up logs. - // - Make transferring leader strictly after write commands that executes before - // proposing the locks, preventing unexpected lock loss. - fn propose_locks_before_transfer_leader( - &mut self, - ctx: &mut StoreContext, - msg: &eraftpb::Message, - ) -> bool { - // 1. Disable in-memory pessimistic locks. - - // Clone to make borrow checker happy when registering ticks. - let txn_ext = self.txn_ext().clone(); - let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); - - // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message - // is a reply to a transfer leader command before. If the locks status remain - // in the TransferringLeader status, we can safely initiate transferring leader - // now. - // If it's not in TransferringLeader status now, it is probably because several - // ticks have passed after proposing the locks in the last time and we - // reactivate the memory locks. Then, we should propose the locks again. - if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX - && pessimistic_locks.status == LocksStatus::TransferringLeader - { - return false; - } - - // If it is not writable, it's probably because it's a retried TransferLeader - // and the locks have been proposed. But we still need to return true to - // propose another TransferLeader command. Otherwise, some write requests that - // have marked some locks as deleted will fail because raft rejects more - // proposals. - // It is OK to return true here if it's in other states like MergingRegion or - // NotLeader. In those cases, the locks will fail to propose and nothing will - // happen. 
- if !pessimistic_locks.is_writable() { - return true; - } - pessimistic_locks.status = LocksStatus::TransferringLeader; - self.add_pending_tick(PeerTick::ReactivateMemoryLock); - - // 2. Propose pessimistic locks - if pessimistic_locks.is_empty() { - return false; - } - // FIXME: Raft command has size limit. Either limit the total size of - // pessimistic locks in a region, or split commands here. - let mut encoder = SimpleWriteEncoder::with_capacity(512); - let mut lock_count = 0; - { - // Downgrade to a read guard, do not block readers in the scheduler as far as - // possible. - let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); - fail_point!("invalidate_locks_before_transfer_leader"); - for (key, (lock, deleted)) in &*pessimistic_locks { - if *deleted { - continue; - } - lock_count += 1; - encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); - } - } - if lock_count == 0 { - // If the map is not empty but all locks are deleted, it is possible that a - // write command has just marked locks deleted but not proposed yet. - // It might cause that command to fail if we skip proposing the - // extra TransferLeader command here. - return true; - } - let mut header = Box::::default(); - header.set_region_id(self.region_id()); - header.set_region_epoch(self.region().get_region_epoch().clone()); - header.set_peer(self.peer().clone()); - info!( - self.logger, - "propose {} locks before transferring leader", lock_count; - ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, write.header, write.data, write.ch); - true - } } impl Apply { diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index b330d0093fe..fd53090fd65 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -181,6 +181,11 @@ impl ProposalControl { } } + #[inline] + pub fn has_uncommitted_admin(&self) -> bool { + !self.proposed_admin_cmd.is_empty() && !self.proposed_admin_cmd.back().unwrap().committed + } + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { while !self.proposed_admin_cmd.is_empty() { let cmd = self.proposed_admin_cmd.front_mut().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 35b4ec1918e..edca9510c27 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,9 +16,9 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. 
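The new has_uncommitted_admin helper added to ProposalControl only needs to inspect the back of the proposed-admin queue, since commands commit in log order. A small self-contained sketch of that check, with a simplified AdminCmd record standing in for the real proposal entry:

```rust
use std::collections::VecDeque;

// Minimal model of the has_uncommitted_admin check: admin commands are queued
// in proposal order and commit in that order, so checking the most recently
// proposed one is sufficient.
struct AdminCmd {
    committed: bool,
}

fn has_uncommitted_admin(proposed: &VecDeque<AdminCmd>) -> bool {
    !proposed.is_empty() && !proposed.back().unwrap().committed
}

fn main() {
    let mut q = VecDeque::new();
    q.push_back(AdminCmd { committed: true });
    assert!(!has_uncommitted_admin(&q));
    q.push_back(AdminCmd { committed: false });
    assert!(has_uncommitted_admin(&q));
}
```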
-use std::mem; +use std::{mem, time::Duration}; -use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; +use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, }; @@ -33,18 +33,23 @@ use raftstore::{ Proposal, }, local_metrics::RaftMetrics, + metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, - util, WriteCallback, + util, Config, WriteCallback, }, Error, Result, }; -use tikv_util::{box_err, time::monotonic_raw_now}; +use slog::{info, warn}; +use tikv_util::{ + box_err, slog_panic, + time::{duration_to_sec, monotonic_raw_now, Instant}, +}; use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel}, + router::{ApplyRes, ApplyTask, CmdResChannel, PeerTick}, }; mod admin; @@ -52,7 +57,8 @@ mod control; mod write; pub use admin::{ - temp_split_path, AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, + temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, SplitFlowControl, SplitInit, + SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ @@ -65,12 +71,12 @@ fn parse_at(logger: &slog::Logger, buf: &[u8], index: u64, let mut m = M::default(); match m.merge_from_bytes(buf) { Ok(()) => m, - Err(e) => panic!( - "{:?} data is corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data is corrupted"; + "term" => term, + "index" => index, + "error" => ?e, ), } } @@ -80,6 +86,7 @@ pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for /// flow control. entry_and_proposals: Vec<(Entry, Vec)>, + committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { @@ -104,6 +111,7 @@ impl Peer { let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + &store_ctx.cfg, self.peer().clone(), region_state, mailbox, @@ -111,6 +119,7 @@ impl Peer { read_scheduler, self.flush_state().clone(), self.storage().apply_trace().log_recovery(), + self.entry_storage().applied_term(), logger, ); @@ -212,12 +221,35 @@ impl Peer { } proposal.must_pass_epoch_check = self.applied_to_current_term(); proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); + self.report_batch_wait_duration(ctx, &proposal.cb); self.proposals_mut().push(proposal); self.set_has_ready(); } + fn report_batch_wait_duration( + &self, + ctx: &mut StoreContext, + ch: &Vec, + ) { + if !ctx.raft_metrics.waterfall_metrics || ch.is_empty() { + return; + } + let now = std::time::Instant::now(); + for c in ch { + for tracker in c.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); + } + } + } + #[inline] - pub fn schedule_apply_committed_entries(&mut self, committed_entries: Vec) { + pub fn schedule_apply_committed_entries( + &mut self, + ctx: &mut StoreContext, + committed_entries: Vec, + ) { if committed_entries.is_empty() { return; } @@ -237,6 +269,7 @@ impl Peer { } else { entry_and_proposals = committed_entries.into_iter().map(|e| (e, vec![])).collect(); } + self.report_store_time_duration(ctx, &mut entry_and_proposals); // Unlike v1, v2 doesn't need to persist commit index and commit term. 
The // point of persist commit index/term of raft apply state is to recover commit // index when the writes to raft engine is lost but writes to kv engine is @@ -244,16 +277,42 @@ impl Peer { // memtables in kv engine is flushed. let apply = CommittedEntries { entry_and_proposals, + committed_time: Instant::now(), }; + assert!( + self.apply_scheduler().is_some(), + "apply_scheduler should be something. region_id {}", + self.region_id() + ); self.apply_scheduler() .unwrap() .send(ApplyTask::CommittedEntries(apply)); } + #[inline] + fn report_store_time_duration( + &mut self, + ctx: &mut StoreContext, + entry_and_proposals: &mut [(Entry, Vec)], + ) { + let now = std::time::Instant::now(); + for (_, chs) in entry_and_proposals { + for tracker in chs.write_trackers_mut() { + tracker.observe(now, &ctx.raft_metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + tracker.reset(now); + } + } + } + pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { if !self.serving() { return; } + // TODO: remove following log once stable. + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -301,10 +360,63 @@ impl Peer { apply_res.applied_index, progress_to_be_updated, ); + if self.pause_for_recovery() + && self.storage().entry_storage().commit_index() <= apply_res.applied_index + { + info!(self.logger, "recovery completed"; "apply_index" => apply_res.applied_index); + self.set_pause_for_recovery(false); + // Flush to avoid recover again and again. + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + self.add_pending_tick(PeerTick::Raft); + } + if !self.pause_for_recovery() && self.storage_mut().apply_trace_mut().should_flush() { + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + } + let last_applying_index = self.compact_log_context().last_applying_index(); + let committed_index = self.entry_storage().commit_index(); + if last_applying_index < committed_index { + // We need to continue to apply after previous page is finished. 
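The on_apply_res change above unpauses a recovering peer once apply progress catches up with the commit index observed at startup, then schedules a manual flush and a Raft tick. A stripped-down model of that gate, with the flush and tick scheduling only noted in a comment and all names illustrative:

```rust
// Sketch of the recovery gate: the peer stays paused until the applied index
// reaches the commit index it saw when recovery started.
struct RecoveryGate {
    paused: bool,
    commit_index: u64,
}

impl RecoveryGate {
    fn on_apply_res(&mut self, applied_index: u64) -> bool {
        if self.paused && self.commit_index <= applied_index {
            self.paused = false;
            // The real code also schedules a ManualFlush task and a Raft tick here.
            return true;
        }
        false
    }
}

fn main() {
    let mut gate = RecoveryGate { paused: true, commit_index: 100 };
    assert!(!gate.on_apply_res(99));
    assert!(gate.on_apply_res(100));
    println!("recovery completed");
}
```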
+ self.set_has_ready(); + } + } +} + +#[derive(Debug)] +pub struct ApplyFlowControl { + timer: Instant, + last_check_keys: u64, + need_flush: bool, + yield_time: Duration, + yield_written_bytes: u64, +} + +impl ApplyFlowControl { + pub fn new(cfg: &Config) -> Self { + ApplyFlowControl { + timer: Instant::now_coarse(), + last_check_keys: 0, + need_flush: false, + yield_time: cfg.apply_yield_duration.0, + yield_written_bytes: cfg.apply_yield_write_size.0, + } + } + + #[cfg(test)] + pub fn set_need_flush(&mut self, need_flush: bool) { + self.need_flush = need_flush; } } impl Apply { + #[inline] + pub fn on_start_apply(&mut self) { + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + #[inline] fn should_skip(&self, off: usize, index: u64) -> bool { let log_recovery = self.log_recovery(); @@ -340,11 +452,22 @@ impl Apply { } } } + self.apply_flow_control_mut().need_flush = true; + } + + pub async fn on_manual_flush(&mut self) { + let written_bytes = self.flush(); + if let Err(e) = self.tablet().flush_cfs(&[], false) { + warn!(self.logger, "failed to flush: {:?}", e); + } + self.maybe_reschedule(written_bytes).await } #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); + APPLY_TASK_WAIT_TIME_HISTOGRAM + .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { apply::notify_req_region_removed(self.region_state().get_region().get_id(), ch); @@ -375,6 +498,7 @@ impl Apply { } // Flush may be triggerred in the middle, so always update the index and term. self.set_apply_progress(e.index, e.term); + self.apply_flow_control_mut().need_flush = true; } } @@ -466,6 +590,7 @@ impl Apply { AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } + AdminCmdType::UpdateGcPeer => unimplemented!(), }; match admin_result { @@ -505,17 +630,59 @@ impl Apply { } } + fn should_reschedule(&self, written_bytes: u64) -> bool { + let control = self.apply_flow_control(); + written_bytes >= control.yield_written_bytes + || control.timer.saturating_elapsed() >= control.yield_time + } + + pub async fn maybe_reschedule(&mut self, written_bytes: u64) { + if self.should_reschedule(written_bytes) { + yatp::task::future::reschedule().await; + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + } + + /// Check whether it needs to flush. + /// + /// We always batch as much inputs as possible, flush will only be triggered + /// when it has been processing too long. + pub async fn maybe_flush(&mut self) { + let buffer_keys = self.metrics.written_keys; + let control = self.apply_flow_control_mut(); + if buffer_keys >= control.last_check_keys + 128 { + // Reschedule by write size was designed to avoid too many deletes impacts + // performance so it doesn't need pricise control. If checking bytes here may + // make the batch too small and hurt performance. + if self.should_reschedule(0) { + let written_bytes = self.flush(); + self.maybe_reschedule(written_bytes).await; + } else { + self.apply_flow_control_mut().last_check_keys = self.metrics.written_keys; + } + } + } + #[inline] - pub fn flush(&mut self) { + pub fn flush(&mut self) -> u64 { + // TODO: maybe we should check whether there is anything to flush. 
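ApplyFlowControl drives the yield decision above: the apply loop hands the thread back to the pool once it has either written enough bytes or run for long enough since the last reschedule. A minimal, self-contained model of that check, assuming the same two thresholds as the config-driven fields in the struct:

```rust
use std::time::{Duration, Instant};

// Minimal model of the reschedule check: yield back to the pool once either
// enough bytes were written or the apply loop has run for too long.
struct FlowControl {
    timer: Instant,
    yield_time: Duration,
    yield_written_bytes: u64,
}

impl FlowControl {
    fn should_reschedule(&self, written_bytes: u64) -> bool {
        written_bytes >= self.yield_written_bytes || self.timer.elapsed() >= self.yield_time
    }
}

fn main() {
    let fc = FlowControl {
        timer: Instant::now(),
        yield_time: Duration::from_millis(500),
        yield_written_bytes: 32 * 1024 * 1024,
    };
    // A small batch right after the timer starts does not trigger a yield.
    assert!(!fc.should_reschedule(1024));
    // A large batch does, regardless of elapsed time.
    assert!(fc.should_reschedule(64 * 1024 * 1024));
    println!("flow control checks passed");
}
```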
let (index, term) = self.apply_progress(); + let control = self.apply_flow_control_mut(); + control.last_check_keys = 0; + if !control.need_flush { + return 0; + } + control.need_flush = false; let flush_state = self.flush_state().clone(); - if let Some(wb) = &mut self.write_batch && !wb.is_empty() { + if let Some(wb) = &self.write_batch && !wb.is_empty() { + self.perf_context().start_observe(); let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); + let wb = self.write_batch.as_mut().unwrap(); if let Err(e) = wb.write_callback_opt(&write_opt, || { flush_state.set_applied_index(index); }) { - panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); + slog_panic!(self.logger, "failed to write data"; "error" => ?e); } self.metrics.written_bytes += wb.data_size() as u64; self.metrics.written_keys += wb.count() as u64; @@ -524,13 +691,15 @@ impl Apply { } else { self.write_batch.take(); } - } - let callbacks = self.callbacks_mut(); - for (ch, resp) in callbacks.drain(..) { - ch.set_result(resp); - } - if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { - callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + let tokens: Vec<_> = self + .callbacks_mut() + .iter() + .flat_map(|(v, _)| { + v.write_trackers() + .flat_map(|t| t.as_tracker_token()) + }) + .collect(); + self.perf_context().report_metrics(&tokens); } let mut apply_res = ApplyRes::default(); apply_res.applied_index = index; @@ -538,6 +707,25 @@ impl Apply { apply_res.admin_result = self.take_admin_result().into_boxed_slice(); apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); + let written_bytes = apply_res.metrics.written_bytes; self.res_reporter().report(apply_res); + + // Report result first and then invoking callbacks. This may delays callback a + // little bit, but can make sure all following messages must see the side + // effect of admin commands. + let callbacks = self.callbacks_mut(); + let now = std::time::Instant::now(); + let apply_time = APPLY_TIME_HISTOGRAM.local(); + for (ch, resp) in callbacks.drain(..) 
{ + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_time, |t| &mut t.metrics.apply_time_nanos); + } + ch.set_result(resp); + } + apply_time.flush(); + if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { + callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + } + written_bytes } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index ad6e537b956..14011d6fc1b 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -11,6 +11,7 @@ use raftstore::{ }, Result, }; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -150,13 +151,13 @@ impl Apply { .put_cf(cf, &self.key_buffer, value) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to write ({}, {}) {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - log_wrappers::Value::value(value), - cf, - e + slog_panic!( + self.logger, + "failed to write"; + "key" => %log_wrappers::Value::key(key), + "value" => %log_wrappers::Value::value(value), + "cf" => cf, + "error" => ?e ); }); fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( @@ -177,6 +178,7 @@ impl Apply { } util::check_key_in_region(key, self.region_state().get_region())?; keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector self.write_batch.as_mut().unwrap().delete(&self.key_buffer) @@ -187,12 +189,12 @@ impl Apply { .delete_cf(cf, &self.key_buffer) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to delete {} {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - cf, - e + slog_panic!( + self.logger, + "failed to delete"; + "key" => %log_wrappers::Value::key(key), + "cf" => cf, + "error" => ?e ); }); self.metrics.size_diff_hint -= self.key_buffer.len() as i64; diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 57c01fca9d8..e6f81b20af1 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -5,6 +5,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; +use tikv_util::slog_panic; use crate::{operation::command::parse_at, router::CmdResChannel}; @@ -191,12 +192,12 @@ impl<'a> SimpleWriteReqDecoder<'a> { let mut is = CodedInputStream::from_bytes(&buf[1..]); let header = match is.read_message() { Ok(h) => h, - Err(e) => panic!( - "{:?} data corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data corrupted"; + "term" => term, + "index" => index, + "error" => ?e ), }; let read = is.pos(); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index ea42832eaea..3a9f678bd8c 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -10,8 +10,6 @@ //! sending a message to store fsm first, and then using split to initialized //! the peer. 
-use std::cmp; - use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; @@ -286,8 +284,13 @@ impl Peer { #[inline] pub fn postponed_destroy(&self) -> bool { let entry_storage = self.storage().entry_storage(); - // TODO: check actual split index instead of commit index. - entry_storage.applied_index() != entry_storage.commit_index() + // If it's marked as tombstone, then it must be changed by conf change. In + // this case, all following entries are skipped so applied_index never equals + // to commit_index. + (self.storage().region_state().get_state() != PeerState::Tombstone + && entry_storage.applied_index() != entry_storage.commit_index()) + // Wait for critical commands like split. + || self.has_pending_tombstone_tablets() } /// Start the destroy progress. It will write `Tombstone` state @@ -295,33 +298,29 @@ impl Peer { /// /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. - pub fn start_destroy(&mut self, write_task: &mut WriteTask) { - let entry_storage = self.storage().entry_storage(); + pub fn start_destroy( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + ) { if self.postponed_destroy() { return; } - let first_index = entry_storage.first_index(); - let last_index = entry_storage.last_index(); - if first_index <= last_index { - write_task.cut_logs = match write_task.cut_logs { - None => Some((first_index, last_index)), - Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), - }; - } let raft_engine = self.entry_storage().raft_engine(); let mut region_state = self.storage().region_state().clone(); let region_id = region_state.get_region().get_id(); + // Use extra write to ensure these writes are the last writes to raft engine. let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(2)); - // We only use raft-log-engine for v2, first index is not important. + // We only use raft-log-engine for v2, first index and state are not important. let raft_state = self.entry_storage().raft_state(); raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); - // Write worker will do the clean up when meeting tombstone state. region_state.set_state(PeerState::Tombstone); let applied_index = self.entry_storage().applied_index(); lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); + self.record_tombstone_tablet_for_destroy(ctx, write_task); self.destroy_progress_mut().start(); } @@ -330,7 +329,13 @@ impl Peer { /// memory states. pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); - ctx.router.close(self.region_id()); + let region_id = self.region_id(); + ctx.router.close(region_id); + { + let mut meta = ctx.store_meta.lock().unwrap(); + meta.remove_region(region_id); + meta.readers.remove(®ion_id); + } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create a // new peer. Ignore error as it's just a best effort. 
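The reworked postponed_destroy above boils down to two conditions: wait while there are unapplied committed entries (unless the peer is already a tombstone, in which case remaining entries are skipped), and wait while tombstone tablets are still pending cleanup. A compact sketch of that decision with stand-in types:

```rust
// Illustrative model of the postponed_destroy condition; PeerView is a
// stand-in for the peer's storage and entry-storage state.
#[derive(PartialEq)]
enum PeerState {
    Normal,
    Tombstone,
}

struct PeerView {
    state: PeerState,
    applied_index: u64,
    commit_index: u64,
    pending_tombstone_tablets: usize,
}

fn postponed_destroy(p: &PeerView) -> bool {
    (p.state != PeerState::Tombstone && p.applied_index != p.commit_index)
        || p.pending_tombstone_tablets > 0
}

fn main() {
    // A tombstone peer skips the applied/commit check and can be destroyed now.
    let tombstone = PeerView {
        state: PeerState::Tombstone,
        applied_index: 5,
        commit_index: 9,
        pending_tombstone_tablets: 0,
    };
    assert!(!postponed_destroy(&tombstone));

    // A live peer that still lags behind its commit index must wait.
    let lagging = PeerView {
        state: PeerState::Normal,
        applied_index: 5,
        commit_index: 9,
        pending_tombstone_tablets: 0,
    };
    assert!(postponed_destroy(&lagging));
}
```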
diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index c49a14142ce..76baf31f9c8 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,11 +5,12 @@ mod life; mod pd; mod query; mod ready; +mod txn_ext; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, - SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, - SPLIT_PREFIX, + AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, + RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, + SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ @@ -17,4 +18,8 @@ pub use ready::{ StateStorage, }; -pub(crate) use self::{command::SplitInit, query::LocalReader}; +pub(crate) use self::{ + command::SplitInit, + query::{LocalReader, ReadDelegatePair, SharedReadTablet}, + txn_ext::TxnContext, +}; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 894f39f278b..17abdd85cf0 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -7,6 +7,7 @@ use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::Transport; use slog::error; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -49,9 +50,7 @@ impl Store { stats.set_bytes_written(0); stats.set_keys_written(0); stats.set_is_busy(false); - - // stats.set_query_stats(query_stats); - + // TODO: add query stats let task = pd::Task::StoreHeartbeat { stats }; if let Err(e) = ctx.schedulers.pd.schedule(task) { error!(self.logger(), "notify pd failed"; @@ -76,7 +75,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { #[inline] - pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { + pub fn region_heartbeat_pd(&mut self, ctx: &StoreContext) { let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), @@ -85,8 +84,8 @@ impl Peer { pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, written_keys: self.self_stat().written_keys, - approximate_size: None, - approximate_keys: None, + approximate_size: self.split_flow_control_mut().approximate_size(), + approximate_keys: self.split_flow_control_mut().approximate_keys(), wait_data_peers: Vec::new(), }); if let Err(e) = ctx.schedulers.pd.schedule(task) { @@ -137,10 +136,10 @@ impl Peer { pending_peers.push(p); } else { if ctx.cfg.dev_assert { - panic!( - "{:?} failed to get peer {} from cache", - self.logger.list(), - id + slog_panic!( + self.logger, + "failed to get peer from cache"; + "get_peer_id" => id ); } error!( @@ -206,20 +205,4 @@ impl Peer { ); } } - - #[inline] - pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { - let task = pd::Task::UpdateMaxTimestamp { - region_id: self.region_id(), - initial_status, - txn_ext: self.txn_ext().clone(), - }; - if let Err(e) = ctx.schedulers.pd.schedule(task) { - error!( - self.logger, - "failed to notify pd with UpdateMaxTimestamp"; - "err" => %e, - ); - } - } } diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index ca92729ee6f..3185f1bd24b 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ 
b/components/raftstore-v2/src/operation/query/lease.rs @@ -112,7 +112,7 @@ impl Peer { let time = monotonic_raw_now(); for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() @@ -150,7 +150,7 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &Mutex, + store_meta: &Mutex>, progress: Option, ) { // A nonleader peer should never has leader lease. @@ -170,12 +170,12 @@ impl Peer { }; if let Some(progress) = progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 2cb5497d789..f574571f790 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -2,13 +2,14 @@ // #[PerformanceCriticalPath] use std::{ + num::NonZeroU64, ops::Deref, sync::{atomic, Arc, Mutex}, }; use batch_system::Router; use crossbeam::channel::TrySendError; -use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ errorpb, @@ -20,10 +21,9 @@ use raftstore::{ cmd_resp, util::LeaseState, worker_metrics::{self, TLS_LOCAL_READ_METRICS}, - LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, ReadExecutorProvider, - RegionSnapshot, RequestPolicy, + LocalReaderCore, ReadDelegate, ReadExecutorProvider, RegionSnapshot, }, - Error, Result, + Result, }; use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; @@ -50,6 +50,87 @@ where } } +pub type ReadDelegatePair = (ReadDelegate, SharedReadTablet); + +/// A share struct for local reader. +/// +/// Though it looks like `CachedTablet`, but there are subtle differences. +/// 1. `CachedTablet` always hold the latest version of the tablet. But +/// `SharedReadTablet` should only hold the tablet that matches epoch. So it +/// will be updated only when the epoch is updated. +/// 2. `SharedReadTablet` should always hold a tablet and the same tablet. If +/// tablet is taken, then it should be considered as stale and should check +/// again epoch to load the new `SharedReadTablet`. +/// 3. `SharedReadTablet` may be cloned into thread local. So its cache should +/// be released as soon as possible, so there should be no strong reference +/// that prevents tablet from being dropped after it's marked as stale by other +/// threads. +pub struct SharedReadTablet { + tablet: Arc>>, + cache: Option, + source: bool, +} + +impl SharedReadTablet { + pub fn new(tablet: EK) -> Self { + Self { + tablet: Arc::new(Mutex::new(Some(tablet))), + cache: None, + source: true, + } + } + + /// Should call `fill_cache` first. 
+ pub fn cache(&self) -> &EK { + self.cache.as_ref().unwrap() + } + + pub fn fill_cache(&mut self) -> bool + where + EK: Clone, + { + self.cache = self.tablet.lock().unwrap().clone(); + self.cache.is_some() + } + + pub fn release(&mut self) { + self.cache = None; + } +} + +impl Clone for SharedReadTablet { + fn clone(&self) -> Self { + Self { + tablet: Arc::clone(&self.tablet), + cache: None, + source: false, + } + } +} + +impl Drop for SharedReadTablet { + fn drop(&mut self) { + if self.source { + self.tablet.lock().unwrap().take(); + } + } +} + +enum ReadResult { + Ok(T), + Redirect, + RetryForStaleDelegate, + Err(E), +} + +fn fail_resp(msg: String) -> RaftCmdResponse { + let mut err = errorpb::Error::default(); + err.set_message(msg); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp +} + #[derive(Clone)] pub struct LocalReader where @@ -67,63 +148,69 @@ where E: KvEngine, C: MsgRouter, { - pub fn new( - store_meta: Arc>, - reg: TabletRegistry, - router: C, - logger: Logger, - ) -> Self { + pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { Self { - local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta, reg)), + local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), router, logger, } } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { &self.local_reader.store_meta().store_meta } - pub fn pre_propose_raft_command( + fn pre_propose_raft_command( &mut self, req: &RaftCmdRequest, - ) -> Result, RequestPolicy)>> { - if let Some(delegate) = self.local_reader.validate_request(req)? { - let mut inspector = SnapRequestInspector { - delegate: &delegate, - logger: &self.logger, - }; - match inspector.inspect(req) { - Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), - Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), - // It can not handle other policies. - // TODO: we should only abort when lease expires. For other cases we should retry - // infinitely. - Ok(_) => Ok(None), - Err(e) => Err(e), + ) -> ReadResult<(CachedReadDelegate, ReadRequestPolicy)> { + let mut delegate = match self.local_reader.validate_request(req) { + Ok(Some(delegate)) => delegate, + Ok(None) => return ReadResult::Redirect, + Err(e) => return ReadResult::Err(e), + }; + + if !delegate.cached_tablet.fill_cache() { + return ReadResult::RetryForStaleDelegate; + } + let mut inspector = SnapRequestInspector { + delegate: &delegate, + logger: &self.logger, + }; + match inspector.inspect(req) { + Ok(ReadRequestPolicy::ReadLocal) => { + ReadResult::Ok((delegate, ReadRequestPolicy::ReadLocal)) } - } else { - Err(Error::RegionNotFound(req.get_header().get_region_id())) + Ok(ReadRequestPolicy::StaleRead) => { + ReadResult::Ok((delegate, ReadRequestPolicy::StaleRead)) + } + // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. 
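SharedReadTablet's contract is easy to get wrong, so here is a small generic model of the same idea: clones share one slot, only the source clears the slot on drop, and a reader must fill (and promptly release) a thread-local cache, going back to the store meta when the slot has gone stale. SharedSlot below is an illustrative stand-in, not the actual type:

```rust
use std::sync::{Arc, Mutex};

// Minimal model of the shared-read-tablet idea.
struct SharedSlot<T: Clone> {
    slot: Arc<Mutex<Option<T>>>,
    cache: Option<T>,
    source: bool,
}

impl<T: Clone> SharedSlot<T> {
    fn new(v: T) -> Self {
        Self { slot: Arc::new(Mutex::new(Some(v))), cache: None, source: true }
    }

    // Returns false when the slot was cleared, i.e. the delegate is stale.
    fn fill_cache(&mut self) -> bool {
        self.cache = self.slot.lock().unwrap().clone();
        self.cache.is_some()
    }

    // Drop the cached copy as soon as the read is done so the tablet can be freed.
    fn release(&mut self) {
        self.cache = None;
    }
}

impl<T: Clone> Clone for SharedSlot<T> {
    fn clone(&self) -> Self {
        Self { slot: Arc::clone(&self.slot), cache: None, source: false }
    }
}

impl<T: Clone> Drop for SharedSlot<T> {
    fn drop(&mut self) {
        // Only the source marks the slot stale; clones never clear it.
        if self.source {
            self.slot.lock().unwrap().take();
        }
    }
}

fn main() {
    let source = SharedSlot::new("tablet-v1");
    let mut reader = source.clone();
    assert!(reader.fill_cache());
    reader.release();
    drop(source);
    // After the source is dropped the clone observes a stale slot and must
    // reload a fresh delegate from the store meta.
    assert!(!reader.fill_cache());
}
```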
+ Ok(ReadRequestPolicy::ReadIndex) => ReadResult::Redirect, + Err(e) => ReadResult::Err(e), } } fn try_get_snapshot( &mut self, req: &RaftCmdRequest, - ) -> std::result::Result>, RaftCmdResponse> { + ) -> ReadResult, RaftCmdResponse> { match self.pre_propose_raft_command(req) { - Ok(Some((mut delegate, policy))) => { + ReadResult::Ok((mut delegate, policy)) => { let mut snap = match policy { - RequestPolicy::ReadLocal => { + ReadRequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); let snapshot_ts = monotonic_raw_now(); if !delegate.is_in_leader_lease(snapshot_ts) { - return Ok(None); + return ReadResult::Redirect; } TLS_LOCAL_READ_METRICS @@ -133,18 +220,24 @@ where self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); snap } - RequestPolicy::StaleRead => { + ReadRequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_requests.inc()); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); @@ -154,12 +247,15 @@ where }; snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.term = NonZeroU64::new(delegate.term); + snap.txn_extra_op = delegate.txn_extra_op.load(); snap.bucket_meta = delegate.bucket_meta.clone(); - Ok(Some(snap)) + delegate.cached_tablet.release(); + + ReadResult::Ok(snap) } - Ok(None) => Ok(None), - Err(e) => { + ReadResult::Err(e) => { let mut response = cmd_resp::new_error(e); if let Some(delegate) = self .local_reader @@ -168,8 +264,10 @@ where { cmd_resp::bind_term(&mut response, delegate.term); } - Err(response) + ReadResult::Err(response) } + ReadResult::Redirect => ReadResult::Redirect, + ReadResult::RetryForStaleDelegate => ReadResult::RetryForStaleDelegate, } } @@ -179,50 +277,85 @@ where ) -> impl Future, RaftCmdResponse>> + Send { let region_id = req.header.get_ref().region_id; - let res = match self.try_get_snapshot(&req) { - res @ (Ok(Some(_)) | Err(_)) => Either::Left(res), - Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), + let mut tried_cnt = 0; + let res = loop { + let res = self.try_get_snapshot(&req); + match res { + ReadResult::Ok(snap) => break Either::Left(Ok(snap)), + ReadResult::Err(e) => break Either::Left(Err(e)), + ReadResult::Redirect => { + break Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())); + } + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + break Either::Left(Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", + region_id + )))); + } + } }; worker_metrics::maybe_tls_local_read_metrics_flush(); async move { - match res { - 
Either::Left(Ok(Some(snap))) => Ok(snap), - Either::Left(Err(e)) => Err(e), - Either::Right((fut, mut reader)) => { - let err = match fut.await? { - Some(query_res) => { - if query_res.read().is_some() { - // If query successful, try again. - req.mut_header().set_read_quorum(false); - if let Some(snap) = reader.try_get_snapshot(&req)? { - return Ok(snap); - } else { - let mut err = errorpb::Error::default(); - err.set_message(format!("no delegate found for {}", region_id)); - err - } - } else { - let QueryResult::Response(res) = query_res else { unreachable!() }; - assert!(res.get_header().has_error(), "{:?}", res); - return Err(res); + let (mut fut, mut reader) = match res { + Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Err(e)) => return Err(e), + Either::Right((fut, reader)) => (fut, reader), + }; + + let mut tried_cnt = 0; + loop { + match fut.await? { + Some(query_res) => { + if query_res.read().is_none() { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); + } + } + None => { + return Err(fail_resp(format!( + "internal error: failed to extend lease: canceled: {}", + region_id + ))); + } + } + + // If query successful, try again. + req.mut_header().set_read_quorum(false); + loop { + let r = reader.try_get_snapshot(&req); + match r { + ReadResult::Ok(snap) => return Ok(snap), + ReadResult::Err(e) => return Err(e), + ReadResult::Redirect => { + tried_cnt += 1; + if tried_cnt < 10 { + fut = reader.try_to_renew_lease(region_id, &req); + break; } + return Err(fail_resp(format!( + "internal error: can't handle msg in local reader for {}", + region_id + ))); } - None => { - let mut err = errorpb::Error::default(); - err.set_message(format!( - "failed to extend lease: canceled: {}", + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + return Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", region_id - )); - err + ))); } - }; - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + } } - Either::Left(Ok(None)) => unreachable!(), } } } @@ -309,7 +442,7 @@ where // The reason for this to be Arc, see the comment on get_delegate in // raftstore/src/store/worker/read.rs delegate: Arc, - cached_tablet: CachedTablet, + cached_tablet: SharedReadTablet, } impl Deref for CachedReadDelegate @@ -335,36 +468,20 @@ where } } -impl ReadExecutor for CachedReadDelegate -where - E: KvEngine, -{ - type Tablet = E; - - fn get_tablet(&mut self) -> &E { - self.cached_tablet.latest().unwrap() - } - - fn get_snapshot(&mut self, _: &Option>) -> Arc { - Arc::new(self.cached_tablet.latest().unwrap().snapshot()) - } -} - #[derive(Clone)] struct StoreMetaDelegate where E: KvEngine, { - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, } impl StoreMetaDelegate where E: KvEngine, { - pub fn new(store_meta: Arc>, reg: TabletRegistry) -> StoreMetaDelegate { - StoreMetaDelegate { store_meta, reg } + pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta } } } @@ -373,7 +490,7 @@ where E: KvEngine, { type Executor = CachedReadDelegate; - type StoreMeta = Arc>; + type StoreMeta = Arc>>; fn store_id(&self) -> Option { Some(self.store_meta.as_ref().lock().unwrap().store_id) @@ -384,14 +501,13 @@ where fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { let meta = self.store_meta.as_ref().lock().unwrap(); let reader = meta.readers.get(®ion_id).cloned(); - if 
let Some(reader) = reader { + if let Some((reader, read_tablet)) = reader { // If reader is not None, cache must not be None. - let cached_tablet = self.reg.get(region_id).unwrap(); return ( meta.readers.len(), Some(CachedReadDelegate { delegate: Arc::new(reader), - cached_tablet, + cached_tablet: read_tablet, }), ); } @@ -399,13 +515,19 @@ where } } +enum ReadRequestPolicy { + StaleRead, + ReadLocal, + ReadIndex, +} + struct SnapRequestInspector<'r> { delegate: &'r ReadDelegate, logger: &'r Logger, } impl<'r> SnapRequestInspector<'r> { - fn inspect(&mut self, req: &RaftCmdRequest) -> Result { + fn inspect(&mut self, req: &RaftCmdRequest) -> Result { assert!(!req.has_admin_request()); if req.get_requests().len() != 1 || req.get_requests().first().unwrap().get_cmd_type() != CmdType::Snap @@ -417,26 +539,26 @@ impl<'r> SnapRequestInspector<'r> { let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { - return Ok(RequestPolicy::StaleRead); + return Ok(ReadRequestPolicy::StaleRead); } if req.get_header().get_read_quorum() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // If applied index's term differs from current raft's term, leader transfer // must happened, if read locally, we may read old value. if !self.has_applied_to_current_term() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // Local read should be performed, if and only if leader is in lease. // None for now. match self.inspect_lease() { - LeaseState::Valid => Ok(RequestPolicy::ReadLocal), + LeaseState::Valid => Ok(ReadRequestPolicy::ReadLocal), LeaseState::Expired | LeaseState::Suspect => { // Perform a consistent read to Raft quorum and try to renew the leader lease. 
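The local reader in this patch retries a bounded number of times when it races with a tablet swap (RetryForStaleDelegate) and only forwards to raftstore on Redirect. A simplified sketch of that bounded-retry shape, with a fake try_get_snapshot that succeeds on the second attempt; the cap of 10 mirrors the constant used above, everything else is illustrative:

```rust
// Simplified retry shape of the local snapshot path.
enum ReadResult<T> {
    Ok(T),
    Redirect,
    RetryForStaleDelegate,
    Err(String),
}

fn try_get_snapshot(attempt: u32) -> ReadResult<&'static str> {
    // Pretend the first attempt races with a tablet swap and must retry.
    if attempt == 0 {
        ReadResult::RetryForStaleDelegate
    } else {
        ReadResult::Ok("snapshot")
    }
}

fn snapshot() -> Result<&'static str, String> {
    let mut tried = 0;
    loop {
        match try_get_snapshot(tried) {
            ReadResult::Ok(s) => return Ok(s),
            ReadResult::Err(e) => return Err(e),
            // In the real code Redirect falls back to a read-index query.
            ReadResult::Redirect => return Err("forward to raftstore".into()),
            ReadResult::RetryForStaleDelegate => {
                tried += 1;
                if tried >= 10 {
                    return Err("failed to get a valid delegate".into());
                }
            }
        }
    }
}

fn main() {
    assert_eq!(snapshot().unwrap(), "snapshot");
}
```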
- Ok(RequestPolicy::ReadIndex) + Ok(ReadRequestPolicy::ReadIndex) } } } @@ -480,12 +602,13 @@ mod tests { thread::{self, JoinHandle}, }; + use collections::HashSet; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; + use engine_traits::{MiscExt, SyncMutable, TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use pd_client::BucketMeta; @@ -505,17 +628,27 @@ mod tests { #[derive(Clone)] struct MockRouter { p_router: SyncSender<(u64, PeerMsg)>, + addresses: Arc>>, } impl MockRouter { - fn new() -> (MockRouter, Receiver<(u64, PeerMsg)>) { + fn new(addresses: Arc>>) -> (MockRouter, Receiver<(u64, PeerMsg)>) { let (p_ch, p_rx) = sync_channel(1); - (MockRouter { p_router: p_ch }, p_rx) + ( + MockRouter { + p_router: p_ch, + addresses, + }, + p_rx, + ) } } impl MsgRouter for MockRouter { fn send(&self, addr: u64, cmd: PeerMsg) -> std::result::Result<(), TrySendError> { + if !self.addresses.lock().unwrap().contains(&addr) { + return Err(TrySendError::Disconnected(cmd)); + } self.p_router.send((addr, cmd)).unwrap(); Ok(()) } @@ -524,16 +657,15 @@ mod tests { #[allow(clippy::type_complexity)] fn new_reader( store_id: u64, - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, + addresses: Arc>>, ) -> ( LocalReader, Receiver<(u64, PeerMsg)>, ) { - let (ch, rx) = MockRouter::new(); + let (ch, rx) = MockRouter::new(addresses); let mut reader = LocalReader::new( store_meta, - reg, ch, Logger::root(slog::Discard, o!("key1" => "value1")), ); @@ -607,7 +739,8 @@ mod tests { let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); - let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); + let addresses: Arc>> = Arc::default(); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), addresses.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -649,9 +782,11 @@ mod tests { ); // No msg will ben sent rx.try_recv().unwrap_err(); + // It will be rejected first when processing local, and then rejected when + // trying to forward to raftstore. assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), - 1 + 2 ); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), @@ -680,13 +815,15 @@ mod tests { txn_ext: txn_ext.clone(), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: Some(bucket_meta.clone()), }; - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let ctx = TabletContext::new(®ion1, Some(10)); - reg.load(ctx, true).unwrap(); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + meta.readers.insert(1, (read_delegate, shared)); } let (ch_tx, ch_rx) = sync_channel(1); @@ -701,6 +838,7 @@ mod tests { meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::applied_term(term6)); }), rx, @@ -710,6 +848,7 @@ mod tests { // The first try will be rejected due to unmatched applied term but after update // the applied term by the above thread, the snapshot will be acquired by // retrying. 
+ addresses.lock().unwrap().insert(1); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); assert!(Arc::ptr_eq( @@ -730,14 +869,16 @@ mod tests { // Case: Expire lease to make the local reader lease check fail. lease.expire_remote_lease(); let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let meta = store_meta.clone(); // Send what we want to do to mock raftstore mix_tx .send(( Box::new(move || { - let mut meta = store_meta.lock().unwrap(); + let mut meta = meta.lock().unwrap(); meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::leader_lease(remote)); }), rx, @@ -757,6 +898,25 @@ mod tests { ); rx = ch_rx.recv().unwrap(); + // Case: Tablet miss should triger retry. + { + let ctx = TabletContext::new(®ion1, Some(15)); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + let mut meta = store_meta.lock().unwrap(); + meta.readers.get_mut(&1).unwrap().1 = shared; + } + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Tablet miss should trigger reload tablet, so cache miss should increase. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 6 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + // Case: Read quorum. let mut cmd_read_quorum = cmd.clone(); cmd_read_quorum.mut_header().set_read_quorum(true); @@ -789,6 +949,7 @@ mod tests { assert_eq!(read_progress.safe_ts(), 2); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert_eq!(*snap.get_region(), region1); + assert_eq!(snap.term, NonZeroU64::new(term6)); drop(mix_tx); handler.join().unwrap(); @@ -806,8 +967,7 @@ mod tests { let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1))), reg.clone()); + let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1)))); let tablet1; let tablet2; @@ -816,43 +976,46 @@ mod tests { // Create read_delegate with region id 1 let read_delegate = ReadDelegate::mock(1); - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let mut ctx = TabletContext::with_infinite_region(1, Some(10)); reg.load(ctx, true).unwrap(); tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); + let shared1 = SharedReadTablet::new(tablet1.clone()); + meta.readers.insert(1, (read_delegate, shared1)); // Create read_delegate with region id 2 let read_delegate = ReadDelegate::mock(2); - meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data ctx = TabletContext::with_infinite_region(2, Some(10)); reg.load(ctx, true).unwrap(); tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); + let shared2 = SharedReadTablet::new(tablet2.clone()); + meta.readers.insert(2, (read_delegate, shared2)); } let (_, delegate) = store_meta.get_executor_and_len(1); let mut delegate = delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet1.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val1".to_vec(), - *snapshot.get_value(b"a1").unwrap().unwrap() - ); + let path1 = tablet.path().to_owned(); + 
delegate.cached_tablet.release(); let (_, delegate) = store_meta.get_executor_and_len(2); let mut delegate = delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet2.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val2".to_vec(), - *snapshot.get_value(b"a2").unwrap().unwrap() - ); + + assert!(KvTestEngine::locked(&path1).unwrap()); + drop(tablet1); + drop(reg); + assert!(KvTestEngine::locked(&path1).unwrap()); + store_meta.store_meta.lock().unwrap().readers.remove(&1); + assert!(!KvTestEngine::locked(&path1).unwrap()); } } diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 59c6f2d0f7c..305cdb666cc 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -30,7 +30,7 @@ use raftstore::{ Error, Result, }; use slog::{debug, info}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{ @@ -46,7 +46,7 @@ mod lease; mod local; mod replica; -pub(crate) use self::local::LocalReader; +pub(crate) use self::local::{LocalReader, ReadDelegatePair, SharedReadTablet}; impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> PeerFsmDelegate<'a, EK, ER, T> @@ -363,7 +363,10 @@ impl Peer { } } StatusCmdType::InvalidStatus => { - return Err(box_err!("{:?} invalid status command!", self.logger.list())); + return Err(box_err!( + "{} invalid status command!", + SlogFormat(&self.logger) + )); } } @@ -436,7 +439,7 @@ impl Peer { } let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index fb00adbbc5a..901fd9726f6 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -75,7 +75,7 @@ impl Peer { let time = monotonic_raw_now(); for (req, ch, _) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d6a83b7933b..67bbed5aa4b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -40,8 +40,8 @@ use kvproto::{ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::Logger; -use tikv_util::{box_err, worker::Scheduler}; +use slog::{info, trace, Logger}; +use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ operation::{ @@ -130,10 +130,11 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to an u64 index. 
pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default)] +#[derive(Clone, Copy, Default, Debug)] struct Progress { flushed: u64, - /// The index of last entry that has modification to the CF. + /// The index of last entry that has modification to the CF. The value + /// can be larger than the index that actually modifies the CF in apply. /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. last_modified: u64, @@ -154,7 +155,7 @@ pub fn cf_offset(cf: &str) -> usize { /// interact with other peers will be traced. /// - support query the flushed progress without actually scanning raft engine, /// which is useful for cleaning up stale flush records. -#[derive(Default)] +#[derive(Default, Debug)] pub struct ApplyTrace { /// The modified indexes and flushed index of each data CF. data_cfs: Box<[Progress; DATA_CFS_LEN]>, @@ -168,6 +169,10 @@ pub struct ApplyTrace { admin: Progress, /// Index that is issued to be written. It may not be truely persisted. persisted_applied: u64, + /// Flush will be triggered explicitly when there are too many pending + /// writes. It marks the last index that is flushed to avoid too many + /// flushes. + last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, } @@ -187,9 +192,14 @@ impl ApplyTrace { trace.admin.flushed = i; trace.admin.last_modified = i; trace.persisted_applied = i; - let applied_region_state = engine - .get_region_state(region_id, trace.admin.flushed)? - .unwrap(); + trace.last_flush_trigger = i; + let applied_region_state = match engine.get_region_state(region_id, trace.admin.flushed)? { + Some(s) => s, + None => panic!( + "failed to get region state [region_id={}] [apply_trace={:?}]", + region_id, trace + ), + }; Ok((trace, applied_region_state)) } @@ -218,7 +228,31 @@ impl ApplyTrace { } pub fn persisted_apply_index(&self) -> u64 { - self.admin.flushed + self.persisted_applied + } + + pub fn should_flush(&mut self) -> bool { + if self.admin.flushed < self.admin.last_modified { + // It's waiting for other peers, flush will not help. + return false; + } + let last_modified = self + .data_cfs + .iter() + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.last_modified) + } else { + None + } + }) + .max(); + if let Some(m) = last_modified && m >= self.admin.flushed + 4096000 && m >= self.last_flush_trigger + 4096000 { + self.last_flush_trigger = m; + true + } else { + false + } } // All events before `mem_index` must be consumed before calling this function. @@ -228,10 +262,17 @@ impl ApplyTrace { } let min_flushed = self .data_cfs - .iter() + .iter_mut() // Only unflushed CFs are considered. Flushed CF always have uptodate changes // persisted. .filter_map(|pr| { + // All modifications before mem_index must be seen. If following condition is + // true, it means the modification comes beyond general apply process (like + // transaction GC unsafe write). Align `last_modified` to `flushed` to avoid + // blocking raft log GC. 
+ if mem_index >= pr.flushed && pr.flushed > pr.last_modified { + pr.last_modified = pr.flushed; + } if pr.last_modified != pr.flushed { Some(pr.flushed) } else { @@ -272,19 +313,24 @@ impl ApplyTrace { None } - pub fn reset_snapshot(&mut self, index: u64) { + pub fn restore_snapshot(&mut self, index: u64) { for pr in self.data_cfs.iter_mut() { - pr.flushed = index; pr.last_modified = index; } - self.admin.flushed = index; + self.admin.last_modified = index; + // Snapshot is a special case that KVs are not flushed yet, so all flushed + // state should not be changed. But persisted_applied is updated whenever an + // asynchronous write is triggered. So it can lead to a special case that + // persisted_applied < admin.flushed. It seems no harm ATM though. self.persisted_applied = index; self.try_persist = false; } - #[inline] - pub fn reset_should_persist(&mut self) { - self.try_persist = false; + pub fn on_applied_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + } + self.admin.flushed = index; } #[inline] @@ -415,11 +461,10 @@ impl Storage { return; } } - panic!( - "{:?} data loss detected: {}_{} not found", - self.logger().list(), - region_id, - tablet_index + slog_panic!( + self.logger(), + "tablet loss detected"; + "tablet_index" => tablet_index ); } @@ -440,12 +485,18 @@ impl Storage { } pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let trace = self.apply_trace(); + // Maybe tablet index can be different? + if trace.persisted_applied > trace.admin.flushed { + return; + } let region_id = self.region().get_id(); let raft_engine = self.entry_storage().raft_engine(); let tablet_index = self.tablet_index(); let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(1)); + info!(self.logger(), "persisting admin flushed"; "tablet_index" => tablet_index, "flushed" => trace.admin.flushed); let trace = self.apply_trace_mut(); lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) .unwrap(); @@ -456,6 +507,7 @@ impl Storage { impl Peer { pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + trace!(self.logger, "data flushed"; "cf" => cf, "tablet_index" => tablet_index, "index" => index, "trace" => ?self.storage().apply_trace()); if tablet_index < self.storage().tablet_index() { // Stale tablet. return; @@ -467,6 +519,7 @@ impl Peer { } pub fn on_data_modified(&mut self, modification: DataTrace) { + trace!(self.logger, "on data modified"; "modification" => ?modification, "trace" => ?self.storage().apply_trace()); let apply_index = self.storage().entry_storage().applied_index(); let apply_trace = self.storage_mut().apply_trace_mut(); for (cf, index) in DATA_CFS.iter().zip(modification) { @@ -556,22 +609,22 @@ mod tests { #[test] fn test_apply_trace() { let mut trace = ApplyTrace::default(); - assert_eq!(0, trace.persisted_apply_index()); + assert_eq!(0, trace.admin.flushed); // If there is no modifications, index should be advanced anyway. trace.maybe_advance_admin_flushed(2); - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 3); } trace.maybe_advance_admin_flushed(3); // Modification is not flushed. - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 3); } trace.maybe_advance_admin_flushed(3); // No admin is recorded, index should be advanced. 
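The advance rule for the admin flushed index can be summarized as: never move past a CF that still has unflushed modifications, never move past the in-memory applied index, and realign a CF that was flushed beyond anything it was recorded to modify. The simplified model below reproduces one row of the table-driven test that follows, assuming the tuples read as (flushed, last_modified); it is a sketch of the rule, not the actual implementation:

```rust
// Simplified model of advancing the admin flushed index.
#[derive(Clone, Copy)]
struct Progress {
    flushed: u64,
    last_modified: u64,
}

fn advance_admin_flushed(data_cfs: &mut [Progress], admin: &mut Progress, mem_index: u64) {
    // An unflushed admin command pins the index.
    if admin.flushed < admin.last_modified {
        return;
    }
    let min_unflushed = data_cfs
        .iter_mut()
        .filter_map(|pr| {
            // A CF flushed beyond anything it was recorded to modify (e.g. an
            // out-of-band write) is realigned and treated as clean.
            if mem_index >= pr.flushed && pr.flushed > pr.last_modified {
                pr.last_modified = pr.flushed;
            }
            if pr.last_modified != pr.flushed { Some(pr.flushed) } else { None }
        })
        .min();
    let candidate = min_unflushed.unwrap_or(mem_index).min(mem_index);
    if candidate > admin.flushed {
        admin.flushed = candidate;
    }
}

fn main() {
    // Mirrors one row of the test table: CFs (5,2) (9,3) (7,3), admin (2,2),
    // applied index 6 => admin flushed advances to 6.
    let mut cfs = [
        Progress { flushed: 5, last_modified: 2 },
        Progress { flushed: 9, last_modified: 3 },
        Progress { flushed: 7, last_modified: 3 },
    ];
    let mut admin = Progress { flushed: 2, last_modified: 2 };
    advance_admin_flushed(&mut cfs, &mut admin, 6);
    assert_eq!(admin.flushed, 6);
}
```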
- assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_modify(4); for cf in DATA_CFS { trace.on_flush(cf, 4); @@ -581,25 +634,25 @@ } trace.maybe_advance_admin_flushed(4); // Unflushed admin modification should hold index. - assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_flush(4); trace.maybe_advance_admin_flushed(4); // Admin is flushed, index should be advanced. - assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 5); } trace.maybe_advance_admin_flushed(4); // Though all data CFs are flushed, but index should not be // advanced as we don't know whether there is admin modification. - assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 5); } trace.maybe_advance_admin_flushed(5); // Because modify is recorded, so we know there should be no admin // modification and index can be advanced. - assert_eq!(5, trace.persisted_apply_index()); + assert_eq!(5, trace.admin.flushed); } #[test] @@ -620,6 +673,12 @@ ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + // In special cases, some CF may be flushed without any modification recorded, + // we should still be able to advance the apply index forward. + ([(5, 2), (9, 3), (7, 3)], (2, 2), 3, 3), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 6, 6), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 10, 10), + ([(5, 2), (9, 3), (7, 3)], (2, 3), 10, 2), ]; for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { let mut trace = ApplyTrace::default(); diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index a2707b6d411..96f1611d9f1 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -9,6 +9,7 @@ use raftstore::store::{ WriteSenders, WriteTask, }; use slog::{warn, Logger}; +use tikv_util::slog_panic; use crate::{ batch::{StoreContext, StoreRouter}, @@ -117,11 +118,11 @@ impl AsyncWriter { let last_unpersisted = self.unpersisted_readies.back(); if last_unpersisted.map_or(true, |u| u.number < ready_number) { - panic!( - "{:?} ready number is too large {:?} vs {}", - logger.list(), - last_unpersisted, - ready_number + slog_panic!( + logger, + "ready number is too large"; + "last_unpersisted" => ?last_unpersisted, + "ready_number" => ready_number ); } @@ -130,15 +131,15 @@ impl AsyncWriter { // There must be a match in `self.unpersisted_readies`.
loop { let Some(v) = self.unpersisted_readies.pop_front() else { - panic!("{:?} ready number not found {}", logger.list(), ready_number); + slog_panic!(logger, "ready number not found"; "ready_number" => ready_number); }; has_snapshot |= v.has_snapshot; if v.number > ready_number { - panic!( - "{:?} ready number not matched {:?} vs {}", - logger.list(), - v, - ready_number + slog_panic!( + logger, + "ready number not matched"; + "ready" => ?v, + "ready_number" => ready_number ); } if raft_messages.is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 8b125844d0e..03dce74d4e7 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -30,10 +30,15 @@ use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, - store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, + store::{ + needs_evict_entry_cache, util, worker_metrics::SNAP_COUNTER, FetchedLogs, ReadProgress, + Transport, WriteCallback, WriteTask, + }, }; -use slog::{debug, error, trace, warn}; +use slog::{debug, error, info, trace, warn}; use tikv_util::{ + log::SlogFormat, + slog_panic, store::find_peer, time::{duration_to_sec, monotonic_raw_now}, }; @@ -47,9 +52,12 @@ use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, - router::{ApplyTask, PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick}, + worker::tablet_gc, }; +const PAUSE_FOR_RECOVERY_GAP: u64 = 128; + impl Store { pub fn on_store_unreachable( &mut self, @@ -62,6 +70,19 @@ impl Store { ctx.router .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); } + + #[cfg(feature = "testexport")] + pub fn on_wait_flush( + &mut self, + ctx: &mut StoreContext, + region_id: u64, + ch: crate::router::FlushChannel, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let _ = ctx.router.send(region_id, PeerMsg::WaitFlush(ch)); + } } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -73,12 +94,64 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } self.schedule_tick(PeerTick::Raft); } + + pub fn on_check_long_uncommitted(&mut self) { + if !self.fsm.peer().is_leader() { + return; + } + self.fsm + .peer_mut() + .check_long_uncommitted_proposals(self.store_ctx); + self.schedule_tick(PeerTick::CheckLongUncommitted); + } } impl Peer { + pub fn maybe_pause_for_recovery(&mut self, store_ctx: &mut StoreContext) -> bool { + // The task needs to be scheduled even if the tablet may be replaced during + // recovery. Otherwise if there are merges during recovery, the FSM may + // be paused forever. + if self.storage().has_dirty_data() { + let region_id = self.region_id(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let tablet_index = self.storage().tablet_index(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } + let entry_storage = self.storage().entry_storage(); + let committed_index = entry_storage.commit_index(); + let applied_index = entry_storage.applied_index(); + if committed_index > applied_index { + // Unlike v1, it's a must to set ready when there are pending entries. 
Otherwise + // it may block forever when there is an unapplied conf change. + self.set_has_ready(); + } + if committed_index > applied_index + PAUSE_FOR_RECOVERY_GAP { + // If there are too many missing logs, we need to skip ticking otherwise + // it may block the raftstore thread for a long time in reading logs for + // election timeout. + info!(self.logger, "pause for recovery"; "applied" => applied_index, "committed" => committed_index); + self.set_pause_for_recovery(true); + true + } else { + false + } + } + #[inline] fn tick(&mut self) -> bool { - self.raft_group_mut().tick() + // When it's handling snapshot, it's pointless to tick as all the side + // effects have to wait till snapshot is applied. On the other hand, ticking + // will bring other corner cases like elections. + !self.is_handling_snapshot() && self.raft_group_mut().tick() } pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { @@ -107,6 +180,10 @@ impl Peer { "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), ); + if self.pause_for_recovery() && msg.get_message().get_msg_type() == MessageType::MsgAppend { + ctx.raft_metrics.message_dropped.recovery.inc(); + return; + } if !self.serving() { return; } @@ -154,10 +231,14 @@ impl Peer { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } self.insert_peer_cache(msg.take_from_peer()); + let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); + } else { + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); } self.set_has_ready(); @@ -266,6 +347,56 @@ impl Peer { } } + /// Send a message. + /// + /// The message is pushed into the send buffer; it may not be sent out until + /// the transport is flushed explicitly. + fn send_raft_message_on_leader( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let message = msg.get_message(); + if message.get_msg_type() == MessageType::MsgAppend + && let Some(fe) = message.get_entries().first() + && let Some(le) = message.get_entries().last() + { + let last = (le.get_term(), le.get_index()); + let first = (fe.get_term(), fe.get_index()); + let now = Instant::now(); + let queue = self.proposals_mut().queue_mut(); + // Proposals are batched up, so it will likely hit after one or two steps. + for p in queue.iter_mut().rev() { + if p.sent { + break; + } + let cur = (p.term, p.index); + if cur > last { + continue; + } + if cur < first { + break; + } + for tracker in p.cb.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos + }); + } + p.sent = true; + } + } + if message.get_msg_type() == MessageType::MsgTimeoutNow { + // After a leader transfer procedure is triggered, the lease for + // the old leader may be expired earlier than usual, since a new leader + // may be elected and the old leader doesn't step down due to + // network partition from the new leader. + // For lease safety during leader transfer, transit `leader_lease` + // to suspect.
+ self.leader_lease_mut().suspect(monotonic_raw_now()); + } + self.send_raft_message(ctx, msg) + } + fn handle_raft_committed_entries( &mut self, ctx: &mut crate::batch::StoreContext, @@ -273,31 +404,47 @@ impl Peer { ) { // TODO: skip handling committed entries if a snapshot is being applied // asynchronously. - if self.is_leader() { + let mut update_lease = self.is_leader(); + if update_lease { for entry in committed_entries.iter().rev() { - self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); - let propose_time = self - .proposals() - .find_propose_time(entry.get_term(), entry.get_index()); - if let Some(propose_time) = propose_time { - // We must renew current_time because this value may be created a long time ago. - // If we do not renew it, this time may be smaller than propose_time of a - // command, which was proposed in another thread while this thread receives its - // AppendEntriesResponse and is ready to calculate its commit-log-duration. - ctx.current_time.replace(monotonic_raw_now()); - ctx.raft_metrics.commit_log.observe(duration_to_sec( - (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), - )); - self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); - break; + self.compact_log_context_mut() + .add_log_size(entry.get_data().len() as u64); + if update_lease { + let propose_time = self + .proposals() + .find_propose_time(entry.get_term(), entry.get_index()); + if let Some(propose_time) = propose_time { + // We must renew current_time because this value may be created a long time + // ago. If we do not renew it, this time may be + // smaller than propose_time of a command, which was + // proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. + let current_time = monotonic_raw_now(); + ctx.current_time.replace(current_time); + ctx.raft_metrics.commit_log.observe(duration_to_sec( + (current_time - propose_time).to_std().unwrap(), + )); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); + update_lease = false; + } } } } + let applying_index = committed_entries.last().unwrap().index; + let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); + self.compact_log_context_mut() + .set_last_applying_index(applying_index); if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. self.entry_storage_mut().evict_entry_cache(false); } - self.schedule_apply_committed_entries(committed_entries); + self.schedule_apply_committed_entries(ctx, committed_entries); + if self.is_leader() + && commit_to_current_term + && !self.proposal_control().has_uncommitted_admin() + { + self.raft_group_mut().skip_bcast_commit(true); + } } /// Processing the ready of raft. 
A detail description of how it's handled @@ -321,6 +468,7 @@ impl Peer { && !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { + self.maybe_schedule_gen_snapshot(); #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -343,8 +491,8 @@ impl Peer { let prev_commit_index = self.entry_storage().commit_index(); assert!( hs.get_commit() >= prev_commit_index, - "{:?} {:?} {}", - self.logger.list(), + "{} {:?} {}", + SlogFormat(&self.logger), hs, prev_commit_index ); @@ -357,7 +505,7 @@ impl Peer { debug_assert!(self.is_leader()); for msg in ready.take_messages() { if let Some(msg) = self.build_raft_message(msg) { - self.send_raft_message(ctx, msg); + self.send_raft_message_on_leader(ctx, msg); } } } @@ -367,18 +515,11 @@ impl Peer { self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); } - // Check whether there is a pending generate snapshot task, the task - // needs to be sent to the apply system. - // Always sending snapshot task after apply task, so it gets latest - // snapshot. - if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { - self.apply_scheduler() - .unwrap() - .send(ApplyTask::Snapshot(gen_task)); - } + self.maybe_schedule_gen_snapshot(); let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.report_send_to_queue_duration(ctx, &mut write_task, ready.entries()); let prev_persisted = self.storage().apply_trace().persisted_apply_index(); self.merge_state_changes_to(&mut write_task); self.storage_mut() @@ -393,12 +534,14 @@ impl Peer { .collect(); } if !self.serving() { - self.start_destroy(&mut write_task); - ctx.coprocessor_host.on_region_changed( - self.region(), - RegionChangeEvent::Destroy, - self.raft_group().raft.state, - ); + self.start_destroy(ctx, &mut write_task); + if self.persisted_index() != 0 { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); + } } // Ready number should increase monotonically. 
assert!(self.async_writer.known_largest_number() < ready.number()); @@ -411,11 +554,11 @@ impl Peer { } } if !light_rd.messages().is_empty() || light_rd.commit_index().is_some() { - panic!( - "{:?} unexpected messages [{}] commit index [{:?}]", - self.logger.list(), - light_rd.messages().len(), - light_rd.commit_index() + slog_panic!( + self.logger, + "unexpected messages"; + "messages_count" => ?light_rd.messages().len(), + "commit_index" => ?light_rd.commit_index() ); } if !light_rd.committed_entries().is_empty() { @@ -453,8 +596,13 @@ impl Peer { } let persisted_number = self.async_writer.persisted_number(); + let pre_persisted_index = self.persisted_index(); + let pre_committed_index = self.raft_group().raft.raft_log.committed; self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.persisted_index(); + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_persist_log_duration(ctx, pre_persisted_index, persisted_index); + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); // The apply snapshot process order would be: // - Get the snapshot from the ready // - Wait for async writer to load this tablet @@ -477,6 +625,81 @@ impl Peer { } } + #[inline] + fn report_persist_log_duration( + &self, + ctx: &mut StoreContext, + from: u64, + to: u64, + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + for tracker in trackers { + tracker.observe(now, &ctx.raft_metrics.wf_persist_log, |t| { + &mut t.metrics.wf_persist_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_commit_log_duration(&self, ctx: &mut StoreContext, from: u64, to: u64) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + let commit_persisted = i <= self.persisted_index(); + let hist = if commit_persisted { + &ctx.raft_metrics.wf_commit_log + } else { + &ctx.raft_metrics.wf_commit_not_persist_log + }; + for tracker in trackers { + tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_send_to_queue_duration( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + entries: &[raft::eraftpb::Entry], + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() { + return; + } + let now = Instant::now(); + for entry in entries { + if let Some((term, trackers)) = self.proposals().find_trackers(entry.index) { + if entry.term == term { + for tracker in trackers { + write_task.trackers.push(*tracker); + tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { + &mut t.metrics.wf_send_to_queue_nanos + }); + } + } + } + } + } + #[cfg(feature = "testexport")] pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { self.async_writer.subscirbe_flush(ch); @@ -513,23 +736,26 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); - // Init the in-memory pessimistic lock table when the peer becomes leader. - self.activate_in_memory_pessimistic_locks(); - - // A more recent read may happen on the old leader. 
So max ts should - // be updated after a peer becomes leader. - self.require_updating_max_ts(ctx); + self.txn_context().on_became_leader( + ctx, + self.term(), + self.region(), + &self.logger, + ); // Exit entry cache warmup state when the peer becomes leader. self.entry_storage_mut().clear_entry_cache_warmup_state(); self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::CheckLongUncommitted); } StateRole::Follower => { self.leader_lease_mut().expire(); self.storage_mut().cancel_generating_snap(None); - self.clear_in_memory_pessimistic_locks(); + self.txn_context() + .on_became_follower(self.term(), self.region()); } _ => {} } @@ -542,6 +768,7 @@ prev_lead_transferee: target, vote: self.raft_group().raft.vote, initialized: self.storage().is_initialized(), + peer_id: self.peer().get_id(), }, ); self.proposal_control_mut().maybe_update_term(term); @@ -568,9 +795,11 @@ // leader apply the split command or an election timeout is passed since split // is committed. We already forbid renewing lease after committing split, and // original leader will update the reader delegate with latest epoch after - // applying split before the split peer starts campaign, so here the only thing - // we need to do is marking split is committed (which is done by `commit_to` - // above). It's correct to allow local read during split. + // applying split before the split peer starts campaign, so what needs to be + // done is: 1. mark the split as committed, which is done by `commit_to` above; + // 2. make sure the split result is invisible until the epoch is updated, or a + // reader may miss data from the new tablet, which is done by always publishing + // the tablet in `on_apply_res_split`. So it's correct to allow local read during split. // // - For merge, after the prepare merge command is committed, the target peers // may apply commit merge at any time, so we need to forbid any type of read @@ -586,6 +815,56 @@ self.read_progress_mut().discard(); } } + + /// Check if there is any long uncommitted proposal. + /// + /// This will increase the threshold when a long uncommitted proposal is + /// detected, and reset the threshold when there is no long uncommitted + /// proposal. + fn has_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals().oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // `Some`. + let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected.
+ let threshold = self.long_uncommitted_threshold(); + if elapsed >= threshold { + has_long_uncommitted = true; + self.set_long_uncommitted_threshold(threshold + base_threshold); + } else if elapsed < base_threshold { + self.set_long_uncommitted_threshold(base_threshold); + } + } else { + self.set_long_uncommitted_threshold(base_threshold); + } + has_long_uncommitted + } + + fn check_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group().status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + self.logger, + "found long uncommitted proposals"; + "progress" => ?buffer, + "cache_first_index" => ?self.entry_storage().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold(), + ); + } + } } impl Storage { @@ -607,6 +886,7 @@ impl Storage { ctx.snap_mgr.clone(), ctx.tablet_registry.clone(), ) { + SNAP_COUNTER.apply.fail.inc(); error!(self.logger(),"failed to apply snapshot";"error" => ?e) } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 41dc0d39429..1fae813577c 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -36,17 +36,19 @@ use raft::{eraftpb::Snapshot, StateRole}; use raftstore::{ coprocessor::RegionChangeEvent, store::{ - metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, worker_metrics::SNAP_COUNTER, + GenSnapRes, ReadTask, TabletSnapKey, TabletSnapManager, Transport, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; -use slog::{error, info, warn}; -use tikv_util::box_err; +use slog::{debug, error, info, warn}; +use tikv_util::{box_err, log::SlogFormat, slog_panic}; use crate::{ fsm::ApplyResReporter, - operation::command::temp_split_path, + operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, + router::ApplyTask, Result, StoreContext, }; @@ -160,6 +162,19 @@ pub fn install_tablet( } impl Peer { + /// Check whether there is a pending generate snapshot task, the task + /// needs to be sent to the apply system. + /// Always sending snapshot task after apply task, so it gets latest + /// snapshot. + #[inline] + pub fn maybe_schedule_gen_snapshot(&mut self) { + if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); + } + } + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { self.raft_group_mut().ping(); @@ -197,39 +212,68 @@ impl Peer { StateRole::Follower, ); let persisted_index = self.persisted_index(); - let first_index = self.storage().entry_storage().first_index(); - if first_index == persisted_index + 1 { + self.compact_log_context_mut() + .set_last_applying_index(persisted_index); + let snapshot_index = self.entry_storage().truncated_index(); + assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); + // If leader sends a message append to the follower while it's applying + // snapshot (via split init for example), the persisted_index may be larger + // than the first index. 
But as long as the snapshot index is not larger than the persisted index, the + // latest snapshot should be applied. + if snapshot_index <= persisted_index { let region_id = self.region_id(); - self.reset_flush_state(); + self.reset_flush_state(snapshot_index); let flush_state = self.flush_state().clone(); - let mut tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); // Use a new FlushState to avoid conflicts with the old one. tablet_ctx.flush_state = Some(flush_state); - ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.record_tablet_as_tombstone_and_refresh(persisted_index, ctx); - self.schedule_apply_fsm(ctx); + let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); + assert!( + path.exists(), + "{} {} not exists", + SlogFormat(&self.logger), + path.display() + ); + let tablet = ctx + .tablet_registry + .tablet_factory() + .open_tablet(tablet_ctx, &path) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "failed to load tablet"; + "path" => path.display(), + "error" => ?e + ); + }); + self.storage_mut().on_applied_snapshot(); - self.raft_group_mut().advance_apply_to(persisted_index); + self.raft_group_mut().advance_apply_to(snapshot_index); + let read_tablet = SharedReadTablet::new(tablet.clone()); { let mut meta = ctx.store_meta.lock().unwrap(); meta.set_region(self.region(), true, &self.logger); meta.readers - .insert(region_id, self.generate_read_delegate()); + .insert(region_id, (self.generate_read_delegate(), read_tablet)); meta.region_read_progress .insert(region_id, self.read_progress().clone()); } - self.read_progress_mut() - .update_applied_core(persisted_index); + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(ctx, tablet, snapshot_index); + } + self.read_progress_mut().update_applied_core(snapshot_index); let split = self.storage_mut().split_init_mut().take(); if split.as_ref().map_or(true, |s| { - !s.scheduled || persisted_index != RAFT_INIT_LOG_INDEX + !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX }) { info!(self.logger, "apply tablet snapshot completely"); + SNAP_COUNTER.apply.success.inc(); } if let Some(init) = split { - info!(self.logger, "init with snapshot finished"); + info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } + self.schedule_apply_fsm(ctx); } } } @@ -240,6 +284,7 @@ impl Apply { /// Will schedule a task to read worker and then generate a snapshot /// asynchronously. pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + debug!(self.logger, "scheduling snapshot"; "task" => ?snap_task); // Do not generate, the peer is removed. if self.tombstone() { snap_task.canceled.store(true, Ordering::SeqCst); @@ -315,12 +360,23 @@ impl Storage { }; } - info!( - self.logger(), - "requesting snapshot"; - "request_index" => request_index, - "request_peer" => to, - ); + if self.has_dirty_data() { + info!(self.logger(), "delay generating snapshot as there are still dirty data"; "request_index" => request_index, "request_peer" => to); + // It's OK to delay. If there is still dirty data, it means the tablet has just + // been split. In normal cases, all peers will apply split, so rejecting snapshot + // generation may actually be good for all peers as they are more likely + // to be initialized by split.
+ return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } else { + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + } let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); let mut gen_snap_task = self.gen_snap_task_mut(); @@ -343,6 +399,15 @@ impl Storage { /// Validate the snapshot. Returns true if it's valid. fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { let idx = snap.get_metadata().get_index(); + if idx < RAFT_INIT_LOG_INDEX || snap.get_metadata().get_term() < RAFT_INIT_LOG_TERM { + info!( + self.logger(), + "corrupted snapshot detected, generate again"; + "snap" => ?snap, + "request_index" => request_index, + ); + return false; + } // TODO(nolouch): check tuncated index if idx < request_index { // stale snapshot, should generate again. @@ -461,7 +526,7 @@ impl Storage { let index = entry.truncated_index(); entry.set_applied_term(term); entry.apply_state_mut().set_applied_index(index); - self.apply_trace_mut().reset_snapshot(index); + self.apply_trace_mut().on_applied_snapshot(index); } pub fn apply_snapshot( @@ -489,8 +554,35 @@ impl Storage { )); } + let old_last_index = self.entry_storage().last_index(); + if self.entry_storage().first_index() <= old_last_index { + // All states are rewritten in the following blocks. Stale states will be + // cleaned up by compact worker. Have to use raft write batch here becaue + // raft log engine expects deletes before writes. + let raft_engine = self.entry_storage().raft_engine(); + if task.raft_wb.is_none() { + task.raft_wb = Some(raft_engine.log_batch(64)); + } + let wb = task.raft_wb.as_mut().unwrap(); + raft_engine + .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) + .unwrap_or_else(|e| { + slog_panic!( + self.logger(), + "failed to clean up region"; + "error" => ?e + ) + }); + self.entry_storage_mut().clear(); + } + let last_index = snap.get_metadata().get_index(); let last_term = snap.get_metadata().get_term(); + assert!( + last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, + "{}", + SlogFormat(self.logger()) + ); let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); region_state.set_region(region); @@ -501,7 +593,7 @@ impl Storage { entry_storage.set_truncated_term(last_term); entry_storage.set_last_term(last_term); - self.apply_trace_mut().reset_should_persist(); + self.apply_trace_mut().restore_snapshot(last_index); self.set_ever_persisted(); let lb = task .extra_write @@ -520,6 +612,8 @@ impl Storage { let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + lb.put_dirty_mark(region_id, last_index, true).unwrap(); + self.set_has_dirty_data(true); (temp_split_path(®, region_id), false) } si => ( @@ -533,12 +627,11 @@ impl Storage { // it should load it into the factory after it persisted. 
let hook = move || { if !install_tablet(®, &path, region_id, last_index) { - panic!( - "{:?} failed to install tablet, path: {}, region_id: {}, tablet_index: {}", - logger.list(), - path.display(), - region_id, - last_index + slog_panic!( + logger, + "failed to install tablet"; + "path" => %path.display(), + "tablet_index" => last_index ); } if clean_split { diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs new file mode 100644 index 00000000000..911c1eaab78 --- /dev/null +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -0,0 +1,260 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains everything related to transaction hook. +//! +//! This is the temporary (efficient) solution, it should be implemented as one +//! type of coprocessor. + +use std::sync::{atomic::Ordering, Arc}; + +use crossbeam::atomic::AtomicCell; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use parking_lot::RwLockWriteGuard; +use raft::eraftpb; +use raftstore::store::{ + LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, +}; +use slog::{error, info, Logger}; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{PeerMsg, PeerTick}, + worker::pd, + SimpleWriteEncoder, +}; + +pub struct TxnContext { + ext: Arc, + extra_op: Arc>, + reactivate_memory_lock_ticks: usize, +} + +impl Default for TxnContext { + #[inline] + fn default() -> Self { + Self { + ext: Arc::default(), + extra_op: Arc::new(AtomicCell::new(ExtraOp::Noop)), + reactivate_memory_lock_ticks: 0, + } + } +} + +impl TxnContext { + #[inline] + pub fn on_region_changed(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_leader( + &self, + ctx: &mut StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // A more recent read may happen on the old leader. So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx, term, region, logger); + + // Init the in-memory pessimistic lock table when the peer becomes leader. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_follower(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn ext(&self) -> &Arc { + &self.ext + } + + #[inline] + pub fn extra_op(&self) -> &Arc> { + &self.extra_op + } + + // TODO: find a better place to put all txn related stuff. 
+ fn require_updating_max_ts( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let epoch = region.get_region_epoch(); + let term_low_bits = term & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + info!( + logger, + "require updating max ts"; + "initial_status" => initial_status, + ); + let task = pd::Task::UpdateMaxTimestamp { + region_id: region.get_id(), + initial_status, + txn_ext: self.ext.clone(), + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!(logger, "failed to notify pd with UpdateMaxTimestamp"; "err" => ?e); + } + } + + pub fn split(&self, regions: &[Region], derived: &Region) -> Vec { + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. + pessimistic_locks.version = derived.get_region_epoch().get_version(); + pessimistic_locks.group_by_regions(regions, derived) + } + + pub fn init_with_lock(&self, locks: PeerPessimisticLocks) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + *pessimistic_locks = locks; + } +} + +impl Peer { + /// Returns True means the tick is consumed, otherwise the tick should be + /// rescheduled. + pub fn on_reactivate_memory_lock_tick(&mut self, ctx: &mut StoreContext) { + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. + if !self.is_leader() { + return; + } + + let transferring_leader = self.raft_group().raft.lead_transferee.is_some(); + let txn_context = self.txn_context_mut(); + let mut pessimistic_locks = txn_context.ext.pessimistic_locks.write(); + + // And this tick is currently only used for the leader transfer failure case. + if pessimistic_locks.status != LocksStatus::TransferringLeader { + return; + } + + txn_context.reactivate_memory_lock_ticks += 1; + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && txn_context.reactivate_memory_lock_ticks >= ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + txn_context.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + } + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. + pub fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. 
+ let txn_ext = self.txn_context().ext.clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. + // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.txn_context_mut().reactivate_memory_lock_ticks = 0; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. + let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail::fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); + } + } + if lock_count == 0 { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. 
+ return true; + } + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", lock_count; + ); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); + true + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 666f3adb699..7a1a22a5a95 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,26 +2,27 @@ use std::{mem, sync::Arc}; -use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; +use engine_traits::{ + FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, +}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, - ReadTask, + Config, ReadTask, }; use slog::Logger; -use tikv_util::worker::Scheduler; +use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ - operation::{AdminCmdResult, DataTrace}, + operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, router::CmdResChannel, }; /// Apply applies all the committed commands to kv db. pub struct Apply { peer: metapb::Peer, - /// publish the update of the tablet - remote_tablet: CachedTablet, tablet: EK, + perf_context: EK::PerfContext, pub write_batch: Option, /// A buffer for encoding key. pub key_buffer: Vec, @@ -30,10 +31,15 @@ pub struct Apply { callbacks: Vec<(Vec, RaftCmdResponse)>, + flow_control: ApplyFlowControl, + /// A flag indicates whether the peer is destroyed by applying admin /// command. tombstone: bool, applied_term: u64, + // Apply progress is set after every command in case there is a flush. But it's + // wrong to update flush_state immediately as a manual flush from other thread + // can fetch the wrong apply index from flush_state. applied_index: u64, /// The largest index that have modified each column family. 
modifications: DataTrace, @@ -57,6 +63,7 @@ pub struct Apply { impl Apply { #[inline] pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -64,19 +71,26 @@ impl Apply { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) .unwrap(); + assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); + let applied_index = flush_state.applied_index(); + assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); + let tablet = remote_tablet.latest().unwrap().clone(); + let perf_context = EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); Apply { peer, - tablet: remote_tablet.latest().unwrap().clone(), - remote_tablet, + tablet, + perf_context, write_batch: None, callbacks: vec![], + flow_control: ApplyFlowControl::new(cfg), tombstone: false, - applied_term: 0, + applied_term, applied_index: flush_state.applied_index(), modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], @@ -125,9 +139,6 @@ impl Apply { let log_recovery = self.log_recovery.as_ref().unwrap(); if log_recovery.iter().all(|v| index >= *v) { self.log_recovery.take(); - // Now all logs are recovered, flush them to avoid recover again - // and again. - let _ = self.tablet.flush_cfs(&[], false); } } @@ -151,13 +162,16 @@ impl Apply { &mut self.region_state } - /// Publish the tablet so that it can be used by read worker. - /// - /// Note, during split/merge, lease is expired explicitly and read is - /// forbidden. So publishing it immediately is OK. + /// The tablet can't be public yet, otherwise content of latest tablet + /// doesn't matches its epoch in both readers and peer fsm. #[inline] - pub fn publish_tablet(&mut self, tablet: EK) { - self.remote_tablet.set(tablet.clone()); + pub fn set_tablet(&mut self, tablet: EK) { + assert!( + self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), + "{} setting tablet while still have dirty write batch", + SlogFormat(&self.logger) + ); + self.write_batch.take(); self.tablet = tablet; } @@ -166,6 +180,11 @@ impl Apply { &self.tablet } + #[inline] + pub fn perf_context(&mut self) -> &mut EK::PerfContext { + &mut self.perf_context + } + #[inline] pub fn peer(&self) -> &metapb::Peer { &self.peer @@ -218,4 +237,13 @@ impl Apply { pub fn log_recovery(&self) -> &Option> { &self.log_recovery } + + #[inline] + pub fn apply_flow_control_mut(&mut self) -> &mut ApplyFlowControl { + &mut self.flow_control + } + + pub fn apply_flow_control(&self) -> &ApplyFlowControl { + &self.flow_control + } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 668b0ebf41d..6cfcda4da25 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,17 +1,16 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - mem, - sync::{atomic::Ordering, Arc}, + cmp, mem, + sync::Arc, time::{Duration, Instant}, }; use collections::{HashMap, HashSet}; -use crossbeam::atomic::AtomicCell; use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; +use kvproto::{metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ @@ -19,21 +18,20 @@ use raftstore::{ store::{ fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, - Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, - ReadProgress, TabletSnapManager, TxnExt, WriteTask, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TabletSnapManager, WriteTask, }, }; use slog::Logger; use super::storage::Storage; use crate::{ - batch::StoreContext, fsm::ApplyScheduler, operation::{ - AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, + SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, - worker::tablet_gc, Result, }; @@ -43,11 +41,6 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, - /// Tombstone tablets can only be destroyed when the tablet that replaces it - /// is persisted. This is a list of tablet index that awaits to be - /// persisted. When persisted_apply is advanced, we need to notify tablet_gc - /// worker to destroy them. - pending_tombstone_tablets: Vec, /// Statistics for self. self_stat: PeerStat, @@ -60,8 +53,7 @@ pub struct Peer { peer_heartbeats: HashMap, /// For raft log compaction. - skip_compact_log_ticks: usize, - approximate_raft_log_size: u64, + compact_log_context: CompactLogContext, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. @@ -73,6 +65,7 @@ pub struct Peer { has_ready: bool, /// Sometimes there is no ready at all, but we need to trigger async write. has_extra_write: bool, + pause_for_recovery: bool, /// Writer for persisting side effects asynchronously. pub(crate) async_writer: AsyncWriter, @@ -88,8 +81,7 @@ pub struct Peer { last_region_buckets: Option, /// Transaction extensions related to this peer. - txn_ext: Arc, - txn_extra_op: Arc>, + txn_context: TxnContext, pending_ticks: Vec, @@ -109,6 +101,8 @@ pub struct Peer { /// lead_transferee if this peer(leader) is in a leadership transferring. leader_transferee: u64, + + long_uncommitted_threshold: u64, } impl Peer { @@ -133,7 +127,7 @@ impl Peer { let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); - let flush_state: Arc = Arc::default(); + let flush_state: Arc = Arc::new(FlushState::new(applied_index)); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. 
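A minimal, self-contained sketch of the sharing pattern behind the `FlushState::new(applied_index)` change above: the peer seeds the counter with the recovered apply index and hands the same `Arc` to the apply FSM, so a flush that observes it before the first command never sees a zero baseline (which `Apply::new` now asserts against). The `FlushState` stand-in below is only an illustration built on `AtomicU64`; the real type comes from `engine_traits` and this is not its actual API.

    use std::sync::{
        atomic::{AtomicU64, Ordering},
        Arc,
    };

    // Illustrative stand-in for engine_traits::FlushState: just the applied-index counter.
    struct FlushState(AtomicU64);

    impl FlushState {
        fn new(applied_index: u64) -> Self {
            FlushState(AtomicU64::new(applied_index))
        }
        fn applied_index(&self) -> u64 {
            self.0.load(Ordering::Acquire)
        }
        fn set_applied_index(&self, index: u64) {
            self.0.store(index, Ordering::Release);
        }
    }

    fn main() {
        let applied_index = 5; // recovered from the raft apply state on restart
        let flush_state = Arc::new(FlushState::new(applied_index));
        let apply_view = flush_state.clone(); // the clone handed to the apply FSM
        // A flush that races with peer creation still reports a sane baseline.
        assert_ne!(apply_view.applied_index(), 0);
        apply_view.set_applied_index(6);
        assert_eq!(flush_state.applied_index(), 6);
    }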
@@ -149,18 +143,17 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, - pending_tombstone_tablets: Vec::new(), self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - skip_compact_log_ticks: 0, - approximate_raft_log_size: 0, + compact_log_context: CompactLogContext::new(applied_index), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), apply_scheduler: None, has_ready: false, has_extra_write: false, + pause_for_recovery: false, destroy_progress: DestroyProgress::None, raft_group, logger, @@ -177,8 +170,7 @@ impl Peer { ), region_buckets: None, last_region_buckets: None, - txn_ext: Arc::default(), - txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), + txn_context: TxnContext::default(), proposal_control: ProposalControl::new(0), pending_ticks: Vec::new(), split_trace: vec![], @@ -186,6 +178,10 @@ impl Peer { flush_state, split_flow_control: SplitFlowControl::default(), leader_transferee: raft::INVALID_ID, + long_uncommitted_threshold: cmp::max( + cfg.long_uncommitted_base_threshold.0.as_secs(), + 1, + ), }; // If this region has only one peer and I am the one, campaign directly. @@ -261,11 +257,8 @@ impl Peer { self.read_progress .update_leader_info(self.leader_id(), self.term(), self.region()); - { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } + self.txn_context + .on_region_changed(self.term(), self.region()); if self.serving() { host.on_region_changed( @@ -342,38 +335,18 @@ impl Peer { } #[inline] - pub fn record_tablet_as_tombstone_and_refresh( - &mut self, - new_tablet_index: u64, - ctx: &StoreContext, - ) { - if let Some(old_tablet) = self.tablet.cache() { - self.pending_tombstone_tablets.push(new_tablet_index); - let _ = ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::prepare_destroy( - old_tablet.clone(), - self.region_id(), - new_tablet_index, - )); - } - // TODO: Handle race between split and snapshot. So that we can assert - // `self.tablet.refresh() == 1` - assert!(self.tablet.refresh() > 0); + pub fn set_tablet(&mut self, tablet: EK) -> Option { + self.tablet.set(tablet) } - /// Returns if there's any tombstone being removed. 
#[inline] - pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { - let mut removed = 0; - while let Some(i) = self.pending_tombstone_tablets.first() - && *i <= persisted - { - removed += 1; - } - self.pending_tombstone_tablets.drain(..removed); - removed > 0 + pub fn compact_log_context_mut(&mut self) -> &mut CompactLogContext { + &mut self.compact_log_context + } + + #[inline] + pub fn compact_log_context(&self) -> &CompactLogContext { + &self.compact_log_context } #[inline] @@ -431,6 +404,16 @@ impl Peer { mem::take(&mut self.has_extra_write) } + #[inline] + pub fn set_pause_for_recovery(&mut self, pause: bool) { + self.pause_for_recovery = pause; + } + + #[inline] + pub fn pause_for_recovery(&self) -> bool { + self.pause_for_recovery + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { @@ -526,31 +509,6 @@ impl Peer { down_peers } - #[inline] - pub fn reset_skip_compact_log_ticks(&mut self) { - self.skip_compact_log_ticks = 0; - } - - #[inline] - pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { - if self.skip_compact_log_ticks < max_skip_ticks { - self.skip_compact_log_ticks += 1; - true - } else { - false - } - } - - #[inline] - pub fn approximate_raft_log_size(&self) -> u64 { - self.approximate_raft_log_size - } - - #[inline] - pub fn update_approximate_raft_log_size(&mut self, f: impl Fn(u64) -> u64) { - self.approximate_raft_log_size = f(self.approximate_raft_log_size); - } - #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state @@ -654,8 +612,7 @@ impl Peer { /// See the comments of `check_snap_status` for more details. #[inline] pub fn is_handling_snapshot(&self) -> bool { - // todo: This method may be unnecessary now? - false + self.persisted_index() < self.entry_storage().truncated_index() } /// Returns `true` if the raft group has replicated a snapshot but not @@ -675,21 +632,6 @@ impl Peer { mem::take(&mut self.pending_ticks) } - pub fn activate_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = LocksStatus::Normal; - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - - pub fn clear_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = LocksStatus::NotLeader; - pessimistic_locks.clear(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - #[inline] pub fn post_split(&mut self) { self.reset_region_buckets(); @@ -714,8 +656,13 @@ impl Peer { } #[inline] - pub fn txn_ext(&self) -> &Arc { - &self.txn_ext + pub fn txn_context(&self) -> &TxnContext { + &self.txn_context + } + + #[inline] + pub fn txn_context_mut(&mut self) -> &mut TxnContext { + &mut self.txn_context } pub fn generate_read_delegate(&self) -> ReadDelegate { @@ -726,8 +673,8 @@ impl Peer { self.term(), self.region().clone(), self.storage().entry_storage().applied_term(), - self.txn_extra_op.clone(), - self.txn_ext.clone(), + self.txn_context.extra_op().clone(), + self.txn_context.ext().clone(), self.read_progress().clone(), self.region_buckets.as_ref().map(|b| b.meta.clone()), ) @@ -751,19 +698,6 @@ impl Peer { .advance_apply(apply_index, term, region); } - // TODO: find a better place to put all txn related stuff. 
- pub fn require_updating_max_ts(&self, ctx: &StoreContext) { - let epoch = self.region().get_region_epoch(); - let term_low_bits = self.term() & ((1 << 32) - 1); // 32 bits - let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits - let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); - self.txn_ext - .max_ts_sync_status - .store(initial_status, Ordering::SeqCst); - - self.update_max_timestamp_pd(ctx, initial_status); - } - #[inline] pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { &mut self.split_trace @@ -774,8 +708,8 @@ impl Peer { &self.flush_state } - pub fn reset_flush_state(&mut self) { - self.flush_state = Arc::default(); + pub fn reset_flush_state(&mut self, index: u64) { + self.flush_state = Arc::new(FlushState::new(index)); } // Note: Call `set_has_extra_write` after adding new state changes. @@ -811,4 +745,14 @@ impl Peer { .unwrap_or(raft::INVALID_ID), ) } + + #[inline] + pub fn long_uncommitted_threshold(&self) -> Duration { + Duration::from_secs(self.long_uncommitted_threshold) + } + + #[inline] + pub fn set_long_uncommitted_threshold(&mut self, dur: Duration) { + self.long_uncommitted_threshold = cmp::max(dur.as_secs(), 1); + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 636970c0ad1..ce15ac20621 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -9,7 +9,7 @@ use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb, - raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{ConfState, Entry, Snapshot}, @@ -35,6 +35,9 @@ pub struct Storage { /// by messages, it has not persisted any states, we need to persist them /// at least once dispite whether the state changes since create. ever_persisted: bool, + /// It may have dirty data after split. Use a flag to indicate whether it + /// has finished clean up. + has_dirty_data: bool, logger: Logger, /// Snapshot part. 
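The `require_updating_max_ts` logic removed from `Peer` above (and re-added on `TxnContext` earlier in this diff) packs the term and region-epoch version into one status word before asking PD to bump the max timestamp. A self-contained sketch of that packing, using the same shifts and masks as the diff; the free function name and the test values are illustrative only:

    /// Pack the low 32 bits of the term and the low 31 bits of the epoch version,
    /// leaving bit 0 clear in the initial value; mirrors the initial_status
    /// computation in require_updating_max_ts.
    fn initial_status(term: u64, version: u64) -> u64 {
        let term_low_bits = term & ((1 << 32) - 1); // 32 bits
        let version_low_bits = version & ((1 << 31) - 1); // 31 bits
        (term_low_bits << 32) | (version_low_bits << 1)
    }

    fn main() {
        // term = 5, epoch version = 3 -> 0x0000_0005_0000_0006
        let status = initial_status(5, 3);
        assert_eq!(status, 0x0000_0005_0000_0006);
        assert_eq!(status & 1, 0); // bit 0 stays clear in the initial value
        println!("{status:#018x}");
    }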
@@ -116,6 +119,16 @@ impl Storage { pub fn apply_trace(&self) -> &ApplyTrace { &self.apply_trace } + + #[inline] + pub fn set_has_dirty_data(&mut self, has_dirty_data: bool) { + self.has_dirty_data = has_dirty_data; + } + + #[inline] + pub fn has_dirty_data(&self) -> bool { + self.has_dirty_data + } } impl Storage { @@ -139,6 +152,17 @@ impl Storage { }; let region = region_state.get_region(); let logger = logger.new(o!("region_id" => region.id, "peer_id" => peer.get_id())); + let has_dirty_data = + match engine.get_dirty_mark(region.get_id(), region_state.get_tablet_index()) { + Ok(b) => b, + Err(e) => { + return Err(box_err!( + "failed to get dirty mark for {}: {:?}", + region.get_id(), + e + )); + } + }; let entry_storage = EntryStorage::new( peer.get_id(), engine, @@ -153,6 +177,7 @@ impl Storage { peer: peer.clone(), region_state, ever_persisted: persisted, + has_dirty_data, logger, snap_states: RefCell::new(HashMap::default()), gen_snap_task: RefCell::new(Box::new(None)), @@ -209,10 +234,7 @@ impl Storage { #[inline] pub fn tablet_index(&self) -> u64 { - match self.region_state.get_state() { - PeerState::Tombstone | PeerState::Applying => 0, - _ => self.region_state.get_tablet_index(), - } + self.region_state.get_tablet_index() } #[inline] @@ -298,15 +320,18 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::TestTabletFactory, }; - use engine_traits::{RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS}; + use engine_traits::{ + FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, + }; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, }; use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, - TabletSnapKey, TabletSnapManager, WriteTask, + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, + GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, }; use slog::o; use tempfile::TempDir; @@ -355,14 +380,20 @@ mod tests { region } + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.set_index(index); + e.set_term(term); + e + } + #[test] fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); - let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) - .unwrap(); + let engines = engine_test::new_temp_engine(&path); + let raft_engine = engines.raft.clone(); let mut wb = raft_engine.log_batch(10); write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); @@ -379,26 +410,57 @@ mod tests { .unwrap() .unwrap(); - let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); - let mut task = WriteTask::new(region.get_id(), 5, 0); - s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); + let mut task = WriteTask::new(region.get_id(), 5, 1); + let entries = (RAFT_INIT_LOG_INDEX + 1..RAFT_INIT_LOG_INDEX + 10) + .map(|i| new_entry(i, RAFT_INIT_LOG_TERM)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + write_to_db_for_test(&engines, task); + + let snap_index = RAFT_INIT_LOG_INDEX + 20; + let snap_term = 9; + let path = mgr.final_recv_path(&TabletSnapKey::new( + region.get_id(), + 5, + snap_term, + snap_index, + )); + reg.tablet_factory() + .open_tablet(TabletContext::new(®ion, Some(snap_index)), &path) + 
.unwrap(); + let snapshot = new_empty_snapshot(region.clone(), snap_index, snap_term, false); + let mut task = WriteTask::new(region.get_id(), 5, 1); + s.apply_snapshot(&snapshot, &mut task, mgr, reg.clone()) + .unwrap(); + // Add more entries to check if old entries are cleared. If not, it should panic + // with memtable hole when using raft engine. + let entries = (snap_index + 1..=snap_index + 10) + .map(|i| new_entry(i, snap_term)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + + assert!(!reg.tablet_path(region.get_id(), snap_index).exists()); + assert!(!task.persisted_cbs.is_empty()); + + write_to_db_for_test(&engines, task); + + assert!(reg.tablet_path(region.get_id(), snap_index).exists()); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); - assert_eq!(10, s.entry_storage().truncated_index()); - assert_eq!(1, s.entry_storage().truncated_term()); - assert_eq!(1, s.entry_storage().last_term()); - assert_eq!(10, s.entry_storage().raft_state().last_index); + assert_eq!(snap_index, s.entry_storage().truncated_index()); + assert_eq!(snap_term, s.entry_storage().truncated_term()); + assert_eq!(snap_term, s.entry_storage().last_term()); + assert_eq!(snap_index + 10, s.entry_storage().raft_state().last_index); // This index can't be set before load tablet. - assert_ne!(10, s.entry_storage().applied_index()); - assert_ne!(1, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); - assert!(!task.persisted_cbs.is_empty()); + assert_ne!(snap_index, s.entry_storage().applied_index()); + assert_ne!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); s.on_applied_snapshot(); - assert_eq!(10, s.entry_storage().applied_index()); - assert_eq!(1, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); + assert_eq!(snap_index, s.entry_storage().applied_index()); + assert_eq!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); } #[test] @@ -435,13 +497,15 @@ mod tests { state.set_region(region.clone()); // setup peer applyer let mut apply = Apply::new( + &Config::default(), region.get_peers()[0].clone(), state, router, reg, sched, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger, ); @@ -460,8 +524,8 @@ mod tests { SnapState::Generated(ref snap) => *snap.clone(), ref s => panic!("unexpected state: {:?}", s), }; - assert_eq!(snap.get_metadata().get_index(), 0); - assert_eq!(snap.get_metadata().get_term(), 0); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); assert_eq!(snap.get_data().is_empty(), false); let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 668d7591a40..315f8a0d8eb 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -6,7 +6,7 @@ use std::{ }; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, @@ -33,12 +33,12 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, _region_id: u64, _size: u64) { - // TODO 
+ fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); } - fn update_approximate_keys(&self, _region_id: u64, _keys: u64) { - // TODO + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); } fn ask_split( @@ -115,13 +115,13 @@ where } impl RaftRouter { - pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { + pub fn new(store_id: u64, router: StoreRouter) -> Self { let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let logger = router.logger().clone(); RaftRouter { router: router.clone(), - local_reader: LocalReader::new(store_meta, reg, router, logger), + local_reader: LocalReader::new(store_meta, router, logger), } } @@ -138,7 +138,7 @@ impl RaftRouter { self.router.check_send(addr, msg) } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { self.local_reader.store_meta() } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 05e1baea1cf..092e7e21b5f 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -10,6 +10,7 @@ pub enum ApplyTask { Snapshot(GenSnapTask), /// Writes that doesn't care consistency. UnsafeWrite(Box<[u8]>), + ManualFlush, } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 930de5ff036..8814a97cc5f 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -2,6 +2,7 @@ // #[PerformanceCriticalPath] +use batch_system::ResourceMetered; use kvproto::{ metapb, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, @@ -181,12 +182,24 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + UpdateRegionSize { + size: u64, + }, + UpdateRegionKeys { + keys: u64, + }, + ClearRegionSize, ForceCompactLog, + TabletTrimmed { + tablet_index: u64, + }, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), } +impl ResourceMetered for PeerMsg {} + impl PeerMsg { pub fn raft_query(req: RaftCmdRequest) -> (Self, QueryResSubscriber) { let (ch, sub) = QueryResChannel::pair(); @@ -247,5 +260,15 @@ pub enum StoreMsg { SplitInit(Box), Tick(StoreTick), Start, - StoreUnreachable { to_store_id: u64 }, + StoreUnreachable { + to_store_id: u64, + }, + /// A message that used to check if a flush is happened. + #[cfg(feature = "testexport")] + WaitFlush { + region_id: u64, + ch: super::FlushChannel, + }, } + +impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 2cb75acccfc..f70b6635982 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -30,8 +30,12 @@ use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, WriteCallback, }; -use smallvec::SmallVec; -use tracker::TrackerToken; +use tracker::{get_tls_tracker_token, TrackerToken}; + +union Tracker { + read: TrackerToken, + write: TimeTracker, +} /// A struct allows to watch and notify specific events. 
/// @@ -54,6 +58,7 @@ struct EventCore { before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. waker: AtomicWaker, + tracker: UnsafeCell, } unsafe impl Send for EventCore {} @@ -240,20 +245,24 @@ pub struct BaseChannel { core: Arc>, } +#[inline] +fn pair() -> (BaseChannel, BaseSubscriber) { + let tracker = Tracker { + read: get_tls_tracker_token(), + }; + BaseChannel::::with_mask(u32::MAX, tracker) +} + impl BaseChannel { - /// Creates a pair of channel and subscriber. #[inline] - pub fn pair() -> (Self, BaseSubscriber) { - Self::with_mask(u32::MAX) - } - - fn with_mask(mask: u32) -> (Self, BaseSubscriber) { + fn with_mask(mask: u32, tracker: Tracker) -> (Self, BaseSubscriber) { let core: Arc> = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), event_mask: mask, before_set: UnsafeCell::new(None), waker: AtomicWaker::new(), + tracker: UnsafeCell::new(tracker), }); (Self { core: core.clone() }, BaseSubscriber { core }) } @@ -449,7 +458,10 @@ impl CmdResChannelBuilder { #[inline] pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { - let (c, s) = CmdResChannel::with_mask(self.event_mask); + let tracker = Tracker { + write: TimeTracker::default(), + }; + let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); if let Some(f) = self.before_set { unsafe { *c.core.before_set.get() = Some(f); @@ -463,6 +475,15 @@ impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; const COMMITTED_EVENT: u64 = 2; + + /// Creates a pair of channel and subscriber. + #[inline] + pub fn pair() -> (Self, CmdResSubscriber) { + let tracker = Tracker { + write: TimeTracker::default(), + }; + Self::with_mask(u32::MAX, tracker) + } } impl ErrorCallback for CmdResChannel { @@ -493,12 +514,15 @@ impl WriteCallback for CmdResChannel { self.core.notify_event(Self::COMMITTED_EVENT); } - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListRef<'a> = &'a [TimeTracker]; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + std::slice::from_ref(unsafe { &(*self.core.tracker.get()).write }) } - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + std::slice::from_mut(unsafe { &mut (*self.core.tracker.get()).write }) } // TODO: support executing hooks inside setting result. 
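// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the `Tracker`
// union added to `EventCore` above is untagged, so nothing records which field
// is live. The patch keeps it sound by construction: read channels such as
// `QueryResChannel` only ever store and read `Tracker::read`, while write
// channels such as `CmdResChannel` only touch `Tracker::write`. The standalone
// model below, with hypothetical `ReadToken`/`WriteTimer` types standing in
// for `TrackerToken` and `TimeTracker`, shows that contract in miniature and
// why a single inline tracker can replace the old `Option<SmallVec<...>>`
// accessors that always returned `None`.
use std::cell::UnsafeCell;

#[derive(Clone, Copy, Debug)]
struct ReadToken(u64); // stand-in for tracker::TrackerToken
#[derive(Clone, Copy, Debug, Default)]
struct WriteTimer(u64); // stand-in for raftstore's TimeTracker

// Untagged union: exactly one field is ever initialized per channel kind.
union Slot {
    read: ReadToken,
    write: WriteTimer,
}

// A read channel core: always constructed with, and read back as, `read`.
struct ReadCore(UnsafeCell<Slot>);

impl ReadCore {
    fn new(token: ReadToken) -> Self {
        ReadCore(UnsafeCell::new(Slot { read: token }))
    }
    fn token(&self) -> ReadToken {
        // Sound because `ReadCore` never stores the `write` variant.
        unsafe { (*self.0.get()).read }
    }
}

// A write channel core: always constructed with, and accessed as, `write`.
struct WriteCore(UnsafeCell<Slot>);

impl WriteCore {
    fn new() -> Self {
        WriteCore(UnsafeCell::new(Slot {
            write: WriteTimer::default(),
        }))
    }
    fn timer_mut(&mut self) -> &mut WriteTimer {
        // Sound because `WriteCore` never stores the `read` variant.
        unsafe { &mut (*self.0.get()).write }
    }
}

fn main() {
    let r = ReadCore::new(ReadToken(7));
    let mut w = WriteCore::new();
    w.timer_mut().0 += 1;
    println!("read token = {:?}", r.token());
}
// ---------------------------------------------------------------------------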
@@ -556,6 +580,13 @@ impl QueryResult { pub type QueryResChannel = BaseChannel; +impl QueryResChannel { + #[inline] + pub fn pair() -> (Self, QueryResSubscriber) { + pair() + } +} + impl ErrorCallback for QueryResChannel { #[inline] fn report_error(self, err: RaftCmdResponse) { @@ -576,8 +607,8 @@ impl ReadCallback for QueryResChannel { self.set_result(res); } - fn read_tracker(&self) -> Option<&TrackerToken> { - None + fn read_tracker(&self) -> Option { + Some(unsafe { (*self.core.tracker.get()).read }) } } @@ -592,6 +623,13 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +impl DebugInfoChannel { + #[inline] + pub fn pair() -> (Self, DebugInfoSubscriber) { + pair() + } +} + impl Debug for DebugInfoChannel { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "DebugInfoChannel") @@ -599,17 +637,29 @@ impl Debug for DebugInfoChannel { } #[cfg(feature = "testexport")] -pub type FlushChannel = BaseChannel<()>; -#[cfg(feature = "testexport")] -pub type FlushSubscriber = BaseSubscriber<()>; +mod flush_channel { + use super::*; -#[cfg(feature = "testexport")] -impl Debug for FlushChannel { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "FlushChannel") + pub type FlushChannel = BaseChannel<()>; + pub type FlushSubscriber = BaseSubscriber<()>; + + impl FlushChannel { + #[inline] + pub fn pair() -> (Self, FlushSubscriber) { + pair() + } + } + + impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } } } +#[cfg(feature = "testexport")] +pub use flush_channel::{FlushChannel, FlushSubscriber}; + #[cfg(test)] mod tests { use std::assert_matches::assert_matches; diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/misc.rs similarity index 89% rename from components/raftstore-v2/src/worker/pd/update_max_timestamp.rs rename to components/raftstore-v2/src/worker/pd/misc.rs index 0de3fb9a87c..68c624b089a 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/misc.rs @@ -93,13 +93,10 @@ where } }; - #[cfg(feature = "failpoints")] let delay = (|| { fail::fail_point!("delay_update_max_ts", |_| true); false })(); - #[cfg(not(feature = "failpoints"))] - let delay = false; if delay { info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); @@ -110,4 +107,17 @@ where self.remote.spawn(f); } } + + pub fn handle_report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) { + let resp = self + .pd_client + .report_min_resolved_ts(store_id, min_resolved_ts); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report min resolved_ts failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index cc977e68236..b23d1500914 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -10,10 +10,16 @@ use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; -use pd_client::PdClient; -use raftstore::store::{util::KeysInfoFormatter, FlowStatsReporter, ReadStats, TxnExt, WriteStats}; -use slog::{error, info, Logger}; +use pd_client::{BucketStat, PdClient}; +use 
raftstore::store::{ + util::KeysInfoFormatter, AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, + ReadStats, RegionReadProgressRegistry, SplitInfo, StoreStatsReporter, TabletSnapManager, + TxnExt, WriteStats, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, +}; +use resource_metering::{Collector, CollectorRegHandle, RawRecords}; +use slog::{error, Logger}; use tikv_util::{ + config::VersionTrack, time::UnixSecs, worker::{Runnable, Scheduler}, }; @@ -24,22 +30,36 @@ use crate::{ router::{CmdResChannel, PeerMsg}, }; -mod region_heartbeat; +mod misc; +mod region; mod split; -mod store_heartbeat; -mod update_max_timestamp; +mod store; + +pub use region::RegionHeartbeatTask; -pub use region_heartbeat::RegionHeartbeatTask; +type RecordPairVec = Vec; pub enum Task { - RegionHeartbeat(RegionHeartbeatTask), + // In store.rs. StoreHeartbeat { stats: pdpb::StoreStats, // TODO: StoreReport, StoreDrAutoSyncStatus }, + UpdateStoreInfos { + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + }, + // In region.rs. + RegionHeartbeat(RegionHeartbeatTask), + ReportRegionBuckets(BucketStat), + UpdateReadStats(ReadStats), + UpdateWriteStats(WriteStats), + UpdateRegionCpuRecords(Arc), DestroyPeer { region_id: u64, }, + // In split.rs. AskBatchSplit { region: metapb::Region, split_keys: Vec>, @@ -50,24 +70,51 @@ pub enum Task { ReportBatchSplit { regions: Vec, }, + AutoSplit { + split_infos: Vec, + }, + // In misc.rs. UpdateMaxTimestamp { region_id: u64, initial_status: u64, txn_ext: Arc, }, + ReportMinResolvedTs { + store_id: u64, + min_resolved_ts: u64, + }, } impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {stats:?}") + } + Task::UpdateStoreInfos { + ref cpu_usages, + ref read_io_rates, + ref write_io_rates, + } => write!( + f, + "get store's information: cpu_usages {:?}, read_io_rates {:?}, write_io_rates {:?}", + cpu_usages, read_io_rates, write_io_rates, + ), Task::RegionHeartbeat(ref hb_task) => write!( f, "region heartbeat for region {:?}, leader {}", hb_task.region, hb_task.peer.get_id(), ), - Task::StoreHeartbeat { ref stats, .. } => { - write!(f, "store heartbeat stats: {:?}", stats) + Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), + Task::UpdateReadStats(ref stats) => { + write!(f, "update read stats: {stats:?}") + } + Task::UpdateWriteStats(ref stats) => { + write!(f, "update write stats: {stats:?}") + } + Task::UpdateRegionCpuRecords(ref cpu_records) => { + write!(f, "get region cpu records: {:?}", cpu_records) } Task::DestroyPeer { ref region_id } => { write!(f, "destroy peer of region {}", region_id) @@ -83,11 +130,22 @@ impl Display for Task { KeysInfoFormatter(split_keys.iter()) ), Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::AutoSplit { ref split_infos } => { + write!(f, "auto split split regions, num is {}", split_infos.len()) + } Task::UpdateMaxTimestamp { region_id, .. 
} => write!( f, "update the max timestamp for region {} in the concurrency manager", region_id ), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => write!( + f, + "report min resolved ts: store {}, resolved ts {}", + store_id, min_resolved_ts, + ), } } } @@ -102,17 +160,20 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, + stats_monitor: PdStatsMonitor, remote: Remote, - region_peers: HashMap, - - // For store_heartbeat. + // For store. start_ts: UnixSecs, - store_stat: store_heartbeat::StoreStat, + store_stat: store::StoreStat, - // For region_heartbeat. + // For region. + region_peers: HashMap, + region_buckets: HashMap, + // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, is_hb_receiver_scheduled: bool, @@ -122,6 +183,7 @@ where logger: Logger, shutdown: Arc, + cfg: Arc>, } impl Runner @@ -135,30 +197,51 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, remote: Remote, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + pd_scheduler: Scheduler, + auto_split_controller: AutoSplitController, + region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, logger: Logger, shutdown: Arc, - ) -> Self { - Self { + cfg: Arc>, + ) -> Result { + let mut stats_monitor = PdStatsMonitor::new( + cfg.value().pd_store_heartbeat_tick_interval.0 / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + cfg.value().report_min_resolved_ts_interval.0, + PdReporter::new(pd_scheduler, logger.clone()), + ); + stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + )?; + Ok(Self { store_id, pd_client, raft_engine, tablet_registry, + snap_mgr, router, + stats_monitor, remote, - region_peers: HashMap::default(), start_ts: UnixSecs::zero(), - store_stat: store_heartbeat::StoreStat::default(), + store_stat: store::StoreStat::default(), + region_peers: HashMap::default(), + region_buckets: HashMap::default(), region_cpu_records: HashMap::default(), is_hb_receiver_scheduled: false, concurrency_manager, causal_ts_provider, logger, shutdown, - } + cfg, + }) } } @@ -173,8 +256,17 @@ where fn run(&mut self, task: Task) { self.maybe_schedule_heartbeat_receiver(); match task { - Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), + Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), + Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), Task::AskBatchSplit { region, @@ -184,51 +276,98 @@ where ch, } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { region_id, initial_status, txn_ext, } => 
self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => self.handle_report_min_resolved_ts(store_id, min_resolved_ts), } } } -impl Runner -where - EK: KvEngine, - ER: RaftEngine, - T: PdClient + 'static, -{ - fn handle_destroy_peer(&mut self, region_id: u64) { - match self.region_peers.remove(®ion_id) { - None => {} - Some(_) => { - info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) - } +#[derive(Clone)] +pub struct PdReporter { + scheduler: Scheduler, + logger: Logger, +} + +impl PdReporter { + pub fn new(scheduler: Scheduler, logger: Logger) -> Self { + PdReporter { scheduler, logger } + } +} + +impl FlowStatsReporter for PdReporter { + fn report_read_stats(&self, stats: ReadStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateReadStats(stats)) { + error!(self.logger, "Failed to send read flow statistics"; "err" => ?e); + } + } + + fn report_write_stats(&self, stats: WriteStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateWriteStats(stats)) { + error!(self.logger, "Failed to send write flow statistics"; "err" => ?e); } } } -#[derive(Clone)] -pub struct FlowReporter { - _scheduler: Scheduler, +impl Collector for PdReporter { + fn collect(&self, records: Arc) { + self.scheduler + .schedule(Task::UpdateRegionCpuRecords(records)) + .ok(); + } } -impl FlowReporter { - pub fn new(scheduler: Scheduler) -> Self { - FlowReporter { - _scheduler: scheduler, +impl StoreStatsReporter for PdReporter { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send store infos to pd worker"; + "err" => ?e, + ); } } -} -impl FlowStatsReporter for FlowReporter { - fn report_read_stats(&self, _read_stats: ReadStats) { - // TODO + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } } - fn report_write_stats(&self, _write_stats: WriteStats) { - // TODO + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } } } diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region.rs similarity index 58% rename from components/raftstore-v2/src/worker/pd/region_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/region.rs index 31f84801ed2..d282534329b 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -1,10 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, pdpb}; -use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; +use pd_client::{ + merge_bucket_stats, metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat, +}; +use raftstore::store::{ReadStats, WriteStats}; +use resource_metering::RawRecords; use slog::{debug, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; @@ -44,6 +49,58 @@ pub struct PeerStat { pub approximate_size: u64, } +#[derive(Default)] +pub struct ReportBucket { + current_stat: BucketStat, + last_report_stat: Option, + last_report_ts: UnixSecs, +} + +impl ReportBucket { + fn new(current_stat: BucketStat) -> Self { + Self { + current_stat, + ..Default::default() + } + } + + fn report(&mut self, report_ts: UnixSecs) -> BucketStat { + self.last_report_ts = report_ts; + match self.last_report_stat.replace(self.current_stat.clone()) { + Some(last) => { + let mut delta = BucketStat::new( + self.current_stat.meta.clone(), + pd_client::new_bucket_stats(&self.current_stat.meta), + ); + // Buckets may be changed, recalculate last stats according to current meta. + merge_bucket_stats( + &delta.meta.keys, + &mut delta.stats, + &last.meta.keys, + &last.stats, + ); + for i in 0..delta.meta.keys.len() - 1 { + delta.stats.write_bytes[i] = + self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; + delta.stats.write_keys[i] = + self.current_stat.stats.write_keys[i] - delta.stats.write_keys[i]; + delta.stats.write_qps[i] = + self.current_stat.stats.write_qps[i] - delta.stats.write_qps[i]; + + delta.stats.read_bytes[i] = + self.current_stat.stats.read_bytes[i] - delta.stats.read_bytes[i]; + delta.stats.read_keys[i] = + self.current_stat.stats.read_keys[i] - delta.stats.read_keys[i]; + delta.stats.read_qps[i] = + self.current_stat.stats.read_qps[i] - delta.stats.read_qps[i]; + } + delta + } + None => self.current_stat.clone(), + } + } +} + impl Runner where EK: KvEngine, @@ -244,4 +301,123 @@ where self.remote.spawn(f); self.is_hb_receiver_scheduled = true; } + + pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { + let region_id = region_buckets.meta.region_id; + self.merge_buckets(region_buckets); + let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); + let last_report_ts = if report_buckets.last_report_ts.is_zero() { + self.start_ts + } else { + report_buckets.last_report_ts + }; + let now = UnixSecs::now(); + let interval_second = now.into_inner() - last_report_ts.into_inner(); + let delta = report_buckets.report(now); + let resp = self + .pd_client + .report_region_buckets(&delta, Duration::from_secs(interval_second)); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send buckets"; + "region_id" => region_id, + "version" => delta.meta.version, + "region_epoch" => ?delta.meta.region_epoch, + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.read_bytes += region_info.flow.read_bytes as u64; + peer_stat.read_keys += region_info.flow.read_keys as u64; + self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; + self.store_stat.engine_total_keys_read += 
region_info.flow.read_keys as u64; + peer_stat + .query_stats + .add_query_stats(®ion_info.query_stats.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.query_stats.0); + } + for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(region_buckets); + } + if !stats.region_infos.is_empty() { + self.stats_monitor.maybe_send_read_stats(stats); + } + } + + pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.query_stats.add_query_stats(®ion_info.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.0); + } + } + + pub fn handle_update_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. + self.stats_monitor.maybe_send_cpu_stats(&records); + Self::calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); + } + + pub fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } + + fn merge_buckets(&mut self, mut buckets: BucketStat) { + let region_id = buckets.meta.region_id; + self.region_buckets + .entry(region_id) + .and_modify(|report_bucket| { + let current = &mut report_bucket.current_stat; + if current.meta < buckets.meta { + std::mem::swap(current, &mut buckets); + } + + merge_bucket_stats( + ¤t.meta.keys, + &mut current.stats, + &buckets.meta.keys, + &buckets.stats, + ); + }) + .or_insert_with(|| ReportBucket::new(buckets)); + } + + fn calculate_region_cpu_records( + store_id: u64, + records: Arc, + region_cpu_records: &mut HashMap, + ) { + for (tag, record) in &records.records { + let record_store_id = tag.store_id; + if record_store_id != store_id { + continue; + } + // Reporting a region heartbeat later will clear the corresponding record. 
+ *region_cpu_records.entry(tag.region_id).or_insert(0) += record.cpu_time; + } + } } diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index cb7c3ad9308..bf13e01120a 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -6,10 +6,12 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, }; use pd_client::PdClient; -use slog::{info, warn}; +use raftstore::store::SplitInfo; +use slog::{info, warn, Logger}; +use yatp::{task::future::TaskCell, Remote}; use super::{requests::*, Runner}; -use crate::router::CmdResChannel; +use crate::{batch::StoreRouter, router::CmdResChannel}; fn new_batch_split_region_request( split_keys: Vec>, @@ -37,24 +39,50 @@ where ER: RaftEngine, T: PdClient + 'static, { + #[inline] pub fn handle_ask_batch_split( &mut self, - mut region: metapb::Region, + region: metapb::Region, split_keys: Vec>, peer: metapb::Peer, right_derive: bool, ch: CmdResChannel, + ) { + Self::ask_batch_split_imp( + &self.pd_client, + &self.logger, + &self.router, + &self.remote, + region, + split_keys, + peer, + right_derive, + Some(ch), + ); + } + + fn ask_batch_split_imp( + pd_client: &T, + logger: &Logger, + router: &StoreRouter, + remote: &Remote, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: Option, ) { if split_keys.is_empty() { - info!(self.logger, "empty split key, skip ask batch split"; - "region_id" => region.get_id()); + info!( + logger, + "empty split key, skip ask batch split"; + "region_id" => region.get_id() + ); return; } - let resp = self - .pd_client - .ask_batch_split(region.clone(), split_keys.len()); - let router = self.router.clone(); - let logger = self.logger.clone(); + let resp = pd_client.ask_batch_split(region.clone(), split_keys.len()); + let router = router.clone(); + let logger = logger.clone(); let f = async move { match resp.await { Ok(mut resp) => { @@ -73,7 +101,7 @@ where ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); - send_admin_request(&logger, &router, region_id, epoch, peer, req, Some(ch)); + send_admin_request(&logger, &router, region_id, epoch, peer, req, ch); } Err(e) => { warn!( @@ -85,7 +113,7 @@ where } } }; - self.remote.spawn(f); + remote.spawn(f); } pub fn handle_report_batch_split(&mut self, regions: Vec) { @@ -98,4 +126,37 @@ where }; self.remote.spawn(f); } + + pub fn handle_auto_split(&mut self, split_infos: Vec) { + let pd_client = self.pd_client.clone(); + let logger = self.logger.clone(); + let router = self.router.clone(); + let remote = self.remote.clone(); + + let f = async move { + for split_info in split_infos { + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::ask_batch_split_imp( + &pd_client, + &logger, + &router, + &remote, + region, + vec![split_key], + split_info.peer, + true, + None, + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + // TODO: implement half split + } + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store.rs similarity index 92% rename from components/raftstore-v2/src/worker/pd/store_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/store.rs index 2fbe378cff8..8f30b85d6f3 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -257,6 +257,17 @@ where self.remote.spawn(f); } + pub fn handle_update_store_infos( + &mut self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + self.store_stat.store_cpu_usages = cpu_usages; + self.store_stat.store_read_io_rates = read_io_rates; + self.store_stat.store_write_io_rates = write_io_rates; + } + /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { @@ -272,11 +283,19 @@ where Ok(stats) => stats, }; let disk_cap = disk_stats.total_space(); - // TODO: custom capacity. - let capacity = disk_cap; - // TODO: accurate snapshot size and kv engines size. - let snap_size = 0; - let kv_size = 0; + let capacity = if self.cfg.value().capacity.0 == 0 { + disk_cap + } else { + std::cmp::min(disk_cap, self.cfg.value().capacity.0) + }; + let mut kv_size = 0; + self.tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + let snap_size = self.snap_mgr.total_snap_size().unwrap(); let used_size = snap_size + kv_size + self diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index cc1fcd971e9..d6d19743b1e 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -9,7 +9,7 @@ use std::{ use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; use kvproto::metapb::Region; -use slog::{error, warn, Logger}; +use slog::{debug, error, warn, Logger}; use tikv_util::worker::{Runnable, RunnableWithTimer}; pub enum Task { @@ -17,6 +17,7 @@ pub enum Task { tablet: EK, start_key: Box<[u8]>, end_key: Box<[u8]>, + cb: Box, }, PrepareDestroy { tablet: EK, @@ -31,11 +32,9 @@ pub enum Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match *self { + match self { Task::Trim { - ref start_key, - ref end_key, - .. + start_key, end_key, .. 
} => write!( f, "trim tablet for start_key {}, end_key {}", @@ -65,11 +64,12 @@ impl Display for Task { impl Task { #[inline] - pub fn trim(tablet: EK, region: &Region) -> Self { + pub fn trim(tablet: EK, region: &Region, cb: impl FnOnce() + Send + 'static) -> Self { Task::Trim { tablet, start_key: region.get_start_key().into(), end_key: region.get_end_key().into(), + cb: Box::new(cb), } } @@ -110,7 +110,12 @@ impl Runner { } } - fn trim(tablet: &EK, start_key: &[u8], end_key: &[u8]) -> engine_traits::Result<()> { + fn trim( + tablet: &EK, + start_key: &[u8], + end_key: &[u8], + cb: Box, + ) -> engine_traits::Result<()> { let start_key = keys::data_key(start_key); let end_key = keys::data_end_key(end_key); let range1 = Range::new(&[], &start_key); @@ -121,10 +126,13 @@ impl Runner { for r in [range1, range2] { tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; } + cb(); Ok(()) } fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { + // The tablet is about to be deleted, flush is a waste and will block destroy. + let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); let _ = tablet.pause_background_work(); self.waiting_destroy_tasks .entry(region_id) @@ -156,10 +164,15 @@ impl Runner { "path" => path.display(), ), Ok(false) => { + let (_, region_id, tablet_index) = + registry.parse_tablet_name(path).unwrap_or(("", 0, 0)); // TODO: use a meaningful table context. let _ = registry .tablet_factory() - .destroy_tablet(TabletContext::with_infinite_region(0, None), path) + .destroy_tablet( + TabletContext::with_infinite_region(region_id, Some(tablet_index)), + path, + ) .map_err(|e| { warn!( logger, @@ -170,7 +183,9 @@ impl Runner { }); return true; } - _ => {} + Ok(true) => { + debug!(logger, "ignore locked tablet"; "path" => path.display()); + } } false } @@ -188,8 +203,9 @@ where tablet, start_key, end_key, + cb, } => { - if let Err(e) = Self::trim(&tablet, &start_key, &end_key) { + if let Err(e) = Self::trim(&tablet, &start_key, &end_key, cb) { error!( self.logger, "failed to trim tablet"; @@ -222,6 +238,6 @@ where } fn get_interval(&self) -> Duration { - Duration::from_secs(2) + Duration::from_secs(10) } } diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs index 79356ae5805..e67041ab181 100644 --- a/components/raftstore-v2/tests/failpoints/test_split.rs +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -82,6 +82,9 @@ fn test_restart_resume() { .new_request_for(split_region_id) .take_header() .take_region_epoch(); + // Split will be resumed for region 2, not removing the fp will make write block + // forever. 
+ fail::remove(fp); let timer = Instant::now(); for (region_id, key, val) in cases { let mut put = SimpleWriteEncoder::with_capacity(64); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 4c025a0fc85..2076272b44b 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -33,14 +33,16 @@ use raftstore::{ coprocessor::CoprocessorHost, store::{ region_meta::{RegionLocalState, RegionMeta}, - Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + AutoSplitController, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, }, }; use raftstore_v2::{ create_store_batch_system, - router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, + router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter, StoreMsg}, Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; +use resource_metering::CollectorRegHandle; use slog::{debug, o, Logger}; use tempfile::TempDir; use test_pd::mocker::Service; @@ -125,7 +127,16 @@ impl TestRouter { let res = self.send(region_id, PeerMsg::WaitFlush(ch)); match res { Ok(_) => return block_on(sub.result()).is_some(), - Err(TrySendError::Disconnected(_)) => return false, + Err(TrySendError::Disconnected(m)) => { + let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; + match self + .store_router() + .send_control(StoreMsg::WaitFlush { region_id, ch }) + { + Ok(_) => return block_on(sub.result()).is_some(), + Err(_) => return false, + } + } Err(TrySendError::Full(_)) => thread::sleep(Duration::from_millis(10)), } } @@ -276,7 +287,7 @@ impl RunningState { factory.open_tablet(ctx, &path).unwrap(); } - let router = RaftRouter::new(store_id, registry.clone(), router); + let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); @@ -300,6 +311,8 @@ impl RunningState { concurrency_manager, causal_ts_provider, coprocessor_host, + AutoSplitController::default(), + CollectorRegHandle::new_for_test(), background.clone(), pd_worker, ) diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 8a075bb9a35..4b3445a00ad 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,8 +2,9 @@ use std::{self, time::Duration}; -use engine_traits::{Peekable, CF_DEFAULT}; -use kvproto::raft_cmdpb::AdminCmdType; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::PeerState}; use raft::prelude::ConfChangeType; use raftstore_v2::{ router::{PeerMsg, PeerTick}, @@ -102,3 +103,78 @@ fn test_simple_change() { let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); check_skip_wal(cached.latest().unwrap().as_inner().path()); } + +/// Test if a peer can be destroyed by conf change if logs after conf change are +/// also replicated. 
+#[test] +fn test_remove_by_conf_change() { + let cluster = Cluster::with_node_count(2, None); + let region_id = 2; + let mut req = cluster.routers[0].new_request_for(2); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + // So heartbeat will create a learner. + cluster.dispatch(2, vec![]); + // Trigger the raft tick to replicate the log to the learner and execute the + // snapshot task. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + // Wait some time so the snapshot can be generated. + std::thread::sleep(Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + + // Write one kv to make flow control replicated. + let (key, val) = (b"key", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + cluster.dispatch(region_id, vec![]); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let (admin_msg, admin_sub) = PeerMsg::admin_command(req.clone()); + // Write one kv after the removal. + let (key, val) = (b"key1", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + // Send them at the same time so they will all be sent to the learner. + cluster.routers[0].send(region_id, admin_msg).unwrap(); + cluster.routers[0].send(region_id, msg).unwrap(); + let resp = block_on(admin_sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Dispatch messages so the learner will receive the conf remove and the write + // at the same time. + cluster.dispatch(region_id, vec![]); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + // Wait for apply.
+ std::thread::sleep(Duration::from_millis(100)); + let raft_engine = &cluster.node(1).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 96bcbbccf7a..09ead81c0c2 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -52,6 +52,7 @@ fn test_store_heartbeat() { let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); if stats.get_start_time() > 0 { assert_ne!(stats.get_capacity(), 0); + assert_ne!(stats.get_used_size(), 0); return; } std::thread::sleep(std::time::Duration::from_millis(50)); diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs index d031d6b1eba..18d81ef16aa 100644 --- a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::{assert_matches::assert_matches, time::Duration}; use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; @@ -9,35 +9,32 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, }; use raft::prelude::ConfChangeType; -use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; use tikv_util::store::new_peer; use crate::cluster::Cluster; fn put_data( region_id: u64, - cluster: &Cluster, + cluster: &mut Cluster, node_off: usize, node_off_for_verify: usize, key: &[u8], ) { - let router = &cluster.routers[node_off]; + let mut router = &mut cluster.routers[node_off]; router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); // router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let tablet_registry = cluster.node(node_off).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - assert!(tablet.get_value(key).unwrap().is_none()); + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); let header = Box::new(router.new_request_for(region_id).take_header()); let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, &key[1..], b"value"); + put.put(CF_DEFAULT, key, b"value"); let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); std::thread::sleep(std::time::Duration::from_millis(10)); @@ -53,17 +50,29 @@ fn put_data( let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); - - // Verify the data is ready in the other node - let tablet_registry = cluster.node(node_off_for_verify).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); + router = &mut cluster.routers[node_off]; + let snap = router.stale_snapshot(region_id); + 
assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); + + // Because of skip bcast commit, the data should not be applied yet. + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); + // Trigger heartbeat explicitly to commit on follower. + router = &mut cluster.routers[node_off]; + for _ in 0..2 { + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + } + cluster.dispatch(region_id, vec![]); + std::thread::sleep(std::time::Duration::from_millis(100)); + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); } pub fn must_transfer_leader( @@ -97,7 +106,7 @@ pub fn must_transfer_leader( #[test] fn test_transfer_leader() { - let cluster = Cluster::with_node_count(3, None); + let mut cluster = Cluster::with_node_count(3, None); let region_id = 2; let router0 = &cluster.routers[0]; @@ -137,13 +146,13 @@ fn test_transfer_leader() { cluster.dispatch(region_id, vec![]); // Ensure follower has latest entries before transfer leader. - put_data(region_id, &cluster, 0, 1, b"zkey1"); + put_data(region_id, &mut cluster, 0, 1, b"key1"); // Perform transfer leader must_transfer_leader(&cluster, region_id, 0, 1, peer1); // Before transfer back to peer0, put some data again. - put_data(region_id, &cluster, 1, 0, b"zkey2"); + put_data(region_id, &mut cluster, 1, 0, b"key2"); // Perform transfer leader let store_id = cluster.node(0).id(); diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 548693b71ac..8df501f279d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -72,6 +72,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0", default-features = false } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } serde = "1.0" serde_derive = "1.0" diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 5ba97089f85..2ebf27c963f 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; -use engine_traits::{KvEngine, Snapshot, ALL_CFS, CF_RAFT}; +use engine_traits::{KvEngine, Snapshot, CF_RAFT}; use kvproto::metapb::Region; use crate::{ @@ -63,7 +63,7 @@ fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); - for cf in ALL_CFS { + for cf in snap.cf_names() { snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 794a46b8e3a..0e45ef1d09d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -8,6 +8,7 @@ use kvproto::{ metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; @@ -278,6 +279,7 @@ impl_box_observer_g!( ConsistencyCheckObserver, 
WrappedConsistencyCheckObserver ); +impl_box_observer!(BoxMessageObserver, MessageObserver, WrappedMessageObserver); /// Registry contains all registered coprocessors. #[derive(Clone)] @@ -296,6 +298,7 @@ where read_index_observers: Vec>, pd_task_observers: Vec>, update_safe_ts_observers: Vec>, + message_observers: Vec>, // TODO: add endpoint } @@ -313,6 +316,7 @@ impl Default for Registry { read_index_observers: Default::default(), pd_task_observers: Default::default(), update_safe_ts_observers: Default::default(), + message_observers: Default::default(), } } } @@ -381,6 +385,10 @@ impl Registry { pub fn register_update_safe_ts_observer(&mut self, priority: u32, qo: BoxUpdateSafeTsObserver) { push!(priority, qo, self.update_safe_ts_observers); } + + pub fn register_message_observer(&mut self, priority: u32, qo: BoxMessageObserver) { + push!(priority, qo, self.message_observers); + } } /// A macro that loops over all observers and returns early when error is found @@ -780,6 +788,17 @@ impl CoprocessorHost { true } + /// Returns false if the message should not be stepped later. + pub fn on_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.message_observers { + let observer = observer.observer.inner(); + if !observer.on_raft_message(msg) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -890,6 +909,7 @@ mod tests { OnUpdateSafeTs = 23, PrePersist = 24, PreWriteApplyState = 25, + OnRaftMessage = 26, } impl Coprocessor for TestCoprocessor {} @@ -1132,6 +1152,14 @@ mod tests { } } + impl MessageObserver for TestCoprocessor { + fn on_raft_message(&self, _: &RaftMessage) -> bool { + self.called + .fetch_add(ObserverIndex::OnRaftMessage as usize, Ordering::SeqCst); + true + } + } + macro_rules! 
assert_all { ($target:expr, $expect:expr) => {{ for (c, e) in ($target).iter().zip($expect) { @@ -1168,6 +1196,8 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); host.registry .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); + host.registry + .register_message_observer(1, BoxMessageObserver::new(ob.clone())); let mut index: usize = 0; let region = Region::default(); @@ -1282,6 +1312,11 @@ mod tests { host.pre_write_apply_state(®ion); index += ObserverIndex::PreWriteApplyState as usize; assert_all!([&ob.called], &[index]); + + let msg = RaftMessage::default(); + host.on_raft_message(&msg); + index += ObserverIndex::OnRaftMessage as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 5100e9d4632..98b045dbed8 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -26,14 +26,16 @@ mod metrics; pub mod region_info_accessor; mod split_check; pub mod split_observer; +use kvproto::raft_serverpb::RaftMessage; pub use self::{ config::{Config, ConsistencyCheckMethod}, consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, + BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, + BoxRoleObserver, BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + StoreHandle, }, error::{Error, Result}, region_info_accessor::{ @@ -269,6 +271,7 @@ pub struct RoleChange { /// Which peer is voted by itself. pub vote: u64, pub initialized: bool, + pub peer_id: u64, } impl RoleChange { @@ -280,6 +283,7 @@ impl RoleChange { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, } } } @@ -300,6 +304,7 @@ pub enum RegionChangeReason { PrepareMerge, CommitMerge, RollbackMerge, + SwitchWitness, } #[derive(Clone, Copy, Debug, PartialEq)] @@ -333,6 +338,13 @@ pub trait RegionChangeObserver: Coprocessor { } } +pub trait MessageObserver: Coprocessor { + /// Returns false if the message should not be stepped later. + fn on_raft_message(&self, _: &RaftMessage) -> bool { + true + } +} + #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 3c415c65af6..36fcec7f1f3 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -140,6 +140,9 @@ pub enum Error { region_id: u64, local_state: raft_serverpb::RegionLocalState, }, + + #[error("peer is a witness of region {0}")] + IsWitness(u64), } pub type Result = result::Result; @@ -263,6 +266,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_flashback_not_prepared(e); } + Error::IsWitness(region_id) => { + let mut e = errorpb::IsWitness::default(); + e.set_region_id(region_id); + errorpb.set_is_witness(e); + } _ => {} }; @@ -319,6 +327,7 @@ impl ErrorCodeExt for Error { Error::DataIsNotReady { .. } => error_code::raftstore::DATA_IS_NOT_READY, Error::DeadlineExceeded => error_code::raftstore::DEADLINE_EXCEEDED, Error::PendingPrepareMerge => error_code::raftstore::PENDING_PREPARE_MERGE, + Error::IsWitness(..) 
=> error_code::raftstore::IS_WITNESS, Error::Other(_) | Error::RegionNotRegistered { .. } => error_code::raftstore::UNKNOWN, } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 6104ae7b7cf..1db5f79d226 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -8,6 +8,7 @@ #![feature(hash_drain_filter)] #![feature(let_chains)] #![feature(assert_matches)] +#![feature(type_alias_impl_trait)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index b298ed3529e..45492feb294 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -227,10 +227,10 @@ where error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); SNAP_COUNTER.generate.fail.inc(); } else { + let elapsed = start.saturating_elapsed_secs(); SNAP_COUNTER.generate.success.inc(); - SNAP_HISTOGRAM - .generate - .observe(start.saturating_elapsed_secs()); + SNAP_HISTOGRAM.generate.observe(elapsed); + info!("snapshot generated"; "region_id" => region_id, "elapsed" => elapsed, "key" => ?snap_key, "for_balance" => for_balance); res = Some(Box::new((snapshot, to_peer))) } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b4cceb96a82..98c76ddd6d1 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -186,8 +186,8 @@ where pub raft_wb: Option, // called after writing to kvdb and raftdb. pub persisted_cbs: Vec>, - pub entries: Vec, - pub cut_logs: Option<(u64, u64)>, + overwrite_to: Option, + entries: Vec, pub raft_state: Option, pub extra_write: ExtraWrite, pub messages: Vec, @@ -207,8 +207,8 @@ where ready_number, send_time: Instant::now(), raft_wb: None, + overwrite_to: None, entries: vec![], - cut_logs: None, raft_state: None, extra_write: ExtraWrite::None, messages: vec![], @@ -221,11 +221,21 @@ where pub fn has_data(&self) -> bool { !(self.raft_state.is_none() && self.entries.is_empty() - && self.cut_logs.is_none() && self.extra_write.is_empty() && self.raft_wb.as_ref().map_or(true, |wb| wb.is_empty())) } + /// Append continuous entries. + /// + /// All existing entries with the same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. If + /// entries is empty, nothing will be deleted.
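// Editor's note (illustrative only; it mirrors the test updates to
// write_tests.rs further below, where `new_entry(index, term)` is the test
// helper for building an entry): a caller that previously filled the removed
// `entries`/`cut_logs` pair, e.g.
//
//     task.entries.append(&mut vec![new_entry(6, 6), new_entry(7, 7)]);
//     task.cut_logs = Some((8, 9));
//
// is now expressed as
//
//     task.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]);
//
// which (over)writes entries 6..=7 and deletes any stale log entries up to,
// but excluding, index 9.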
+ pub fn set_append(&mut self, overwrite_to: Option, entries: Vec) { + self.entries = entries; + self.overwrite_to = overwrite_to; + } + #[inline] pub fn ready_number(&self) -> u64 { self.ready_number @@ -387,11 +397,12 @@ where raft_wb.merge(wb).unwrap(); } raft_wb - .append(task.region_id, std::mem::take(&mut task.entries)) + .append( + task.region_id, + task.overwrite_to, + std::mem::take(&mut task.entries), + ) .unwrap(); - if let Some((from, to)) = task.cut_logs { - raft_wb.cut_logs(task.region_id, from, to); - } if let Some(raft_state) = task.raft_state.take() && self.raft_states.insert(task.region_id, raft_state).is_none() { @@ -454,11 +465,12 @@ where self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { let now = std::time::Instant::now(); - for task in &self.tasks { - for tracker in &task.trackers { + for task in &mut self.tasks { + for tracker in &mut task.trackers { tracker.observe(now, &metrics.wf_before_write, |t| { &mut t.metrics.wf_before_write_nanos }); + tracker.reset(now); } } } @@ -538,7 +550,7 @@ where ) -> Self { let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); let perf_context = - raft_engine.get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); + ER::get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { store_id, @@ -901,7 +913,6 @@ where } /// Used for test to write task to kv db and raft db. -#[cfg(test)] pub fn write_to_db_for_test( engines: &engine_traits::Engines, task: WriteTask, @@ -911,7 +922,8 @@ pub fn write_to_db_for_test( { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(&engines.raft, task); - batch.before_write_to_db(&StoreWriteMetrics::new(false)); + let metrics = StoreWriteMetrics::new(false); + batch.before_write_to_db(&metrics); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); @@ -928,6 +940,8 @@ pub fn write_to_db_for_test( }); } } + batch.after_write_to_raft_db(&metrics); + batch.after_write_all(); } #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 6007b39489e..d1861a8903c 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -167,7 +167,9 @@ fn delete_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8]) { /// Simulate kv puts on raft engine. 
fn put_raft_kv(wb: Option<&mut TestRaftLogBatch>, key: u64) { - wb.unwrap().append(key, vec![new_entry(key, key)]).unwrap(); + wb.unwrap() + .append(key, None, vec![new_entry(key, key)]) + .unwrap(); } fn delete_raft_kv(engine: &RaftTestEngine, wb: Option<&mut TestRaftLogBatch>, key: u64) { @@ -294,10 +296,7 @@ fn test_worker() { put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); task_3 .messages @@ -392,10 +391,7 @@ fn test_worker_split_raft_wb() { lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); if split.1 { expected_wbs += 1; @@ -500,8 +496,7 @@ fn test_basic_flow() { delete_kv(task_3.extra_write.v1_mut(), b"kv_k1"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages @@ -603,8 +598,7 @@ fn test_basic_flow_with_states() { lb.put_apply_state(region_1, 5, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 454cf61a4c8..d6994a16ed4 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -68,6 +68,9 @@ pub struct Config { pub raft_log_compact_sync_interval: ReadableDuration, // Interval to gc unnecessary raft log. pub raft_log_gc_tick_interval: ReadableDuration, + // Interval to request voter_replicated_index for gc unnecessary raft log, + // if the leader has not initiated gc for a long time. + pub request_voter_replicated_index_interval: ReadableDuration, // A threshold to gc stale raft log, must >= 1. pub raft_log_gc_threshold: u64, // When entry count exceed this value, gc will be forced trigger. @@ -321,6 +324,12 @@ pub struct Config { #[online_config(hidden)] // Interval to check peers availability info. pub check_peers_availability_interval: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check if need to request snapshot. 
+ pub check_request_snapshot_interval: ReadableDuration, } impl Default for Config { @@ -339,6 +348,7 @@ impl Default for Config { raft_entry_max_size: ReadableSize::mb(8), raft_log_compact_sync_interval: ReadableDuration::secs(2), raft_log_gc_tick_interval: ReadableDuration::secs(3), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 50, raft_log_gc_count_limit: None, raft_log_gc_size_limit: None, @@ -429,6 +439,8 @@ impl Default for Config { unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable check_peers_availability_interval: ReadableDuration::secs(30), + // TODO: make its value reasonable + check_request_snapshot_interval: ReadableDuration::minutes(1), } } } @@ -648,7 +660,7 @@ impl Config { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as usize * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as usize; if self.apply_batch_system.pool_size == 0 || self.apply_batch_system.pool_size > limit { return Err(box_err!( "apply-pool-size should be greater than 0 and less than or equal to: {}", @@ -813,6 +825,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_tick_interval"]) .set(self.raft_log_gc_tick_interval.as_secs_f64()); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["request_voter_replicated_index_interval"]) + .set(self.request_voter_replicated_index_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_threshold"]) .set(self.raft_log_gc_threshold as f64); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index c6278c890f7..afa13730ccf 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -69,6 +69,13 @@ impl CachedEntries { } } + pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { + let entries = self.entries.lock().unwrap(); + for entry in &entries.0 { + f(entry); + } + } + /// Take cached entries and dangle size for them. `dangle` means not in /// entry cache. pub fn take_entries(&self) -> (Vec, usize) { @@ -1075,9 +1082,8 @@ impl EntryStorage { self.cache.append(self.region_id, self.peer_id, &entries); - task.entries = entries; // Delete any previously appended log entries which never committed. 
- task.cut_logs = Some((last_index + 1, prev_last_index + 1)); + task.set_append(Some(prev_last_index + 1), entries); self.raft_state.set_last_index(last_index); self.last_term = last_term; @@ -1227,6 +1233,10 @@ impl EntryStorage { let idx = cache.cache[drain_to].index; let mem_size_change = cache.compact_to(idx + 1); RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } else if !half { + let cache = &mut self.cache; + let mem_size_change = cache.compact_to(u64::MAX); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index affa0205e8f..bb262b9ffa8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -24,7 +24,7 @@ use std::{ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, - HandlerBuilder, PollHandler, Priority, + HandlerBuilder, PollHandler, Priority, ResourceMetered, }; use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; @@ -40,17 +40,18 @@ use kvproto::{ metapb::{self, PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; -use protobuf::{wire_format::WireType, CodedInputStream}; +use protobuf::{wire_format::WireType, CodedInputStream, Message}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; use raft_proto::ConfChangeI; +use resource_control::ResourceController; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -83,7 +84,7 @@ use crate::{ cmd_resp, entry_storage::{self, CachedEntries}, fsm::RaftPollerBuilder, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, memory::*, metrics::*, msg::{Callback, ErrorCallback, PeerMsg, ReadResponse, SignificantMsg}, @@ -151,6 +152,7 @@ impl HeapSize for PendingCmd {} pub struct PendingCmdQueue { normals: VecDeque>, conf_change: Option>, + compacts: VecDeque>, } impl PendingCmdQueue { @@ -158,6 +160,7 @@ impl PendingCmdQueue { PendingCmdQueue { normals: VecDeque::new(), conf_change: None, + compacts: VecDeque::new(), } } @@ -190,6 +193,23 @@ impl PendingCmdQueue { fn set_conf_change(&mut self, cmd: PendingCmd) { self.conf_change = Some(cmd); } + + fn push_compact(&mut self, cmd: PendingCmd) { + self.compacts.push_back(cmd); + } + + fn pop_compact(&mut self, index: u64) -> Option> { + let mut front = None; + while self.compacts.front().map_or(false, |c| c.index < index) { + front = self.compacts.pop_front(); + front.as_mut().unwrap().cb.take().unwrap(); + } + front + } + + fn has_compact(&mut self) -> bool { + !self.compacts.is_empty() + } } #[derive(Default, Debug)] @@ -233,12 +253,20 @@ impl Range { } } +#[derive(Default, Debug)] +pub struct SwitchWitness { + pub index: u64, + pub switches: Vec, + pub region: Region, +} + #[derive(Debug)] pub enum ExecResult { ChangePeer(ChangePeer), CompactLog { state: RaftTruncatedState, first_index: u64, + has_pending: bool, }, SplitRegion { regions: Vec, @@ -281,6 +309,13 @@ pub enum 
ExecResult { SetFlashbackState { region: Region, }, + BatchSwitchWitness(SwitchWitness), + // The raftstore thread will use it to update the internal state of `PeerFsm`. If it is + // `true`, when the raftstore detects that the raft log has not been gc for a long time, + // the raftstore thread will actively pull the `voter_replicated_index` from the leader + // and try to compact pending gc. If false, raftstore does not do any additional + // processing. + HasPendingCompactCmd(bool), } /// The possible returned value when applying logs. @@ -455,7 +490,7 @@ where host, importer, region_scheduler, - engine: engine.clone(), + engine, router, notifier, kv_wb, @@ -468,7 +503,7 @@ where committed_count: 0, sync_log_hint: false, use_delete_range: cfg.use_delete_range, - perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), + perf_context: EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], @@ -562,8 +597,7 @@ where .cb_batch .iter() .flat_map(|(cb, _)| cb.write_trackers()) - .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) - .flatten() + .flat_map(|trackers| trackers.as_tracker_token()) .collect(); self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; @@ -600,7 +634,7 @@ where // Invoke callbacks let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) { - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); @@ -954,6 +988,9 @@ where /// in same Ready should be applied failed. pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + wait_data: bool, + /// The commands waiting to be committed and applied pending_cmds: PendingCmdQueue>, /// The counter of pending request snapshots. See more in `Peer`. @@ -1016,6 +1053,7 @@ where peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, + wait_data: false, last_flush_applied_index: reg.apply_state.get_applied_index(), apply_state: reg.apply_state, applied_term: reg.applied_term, @@ -1094,7 +1132,13 @@ where match res { ApplyResult::None => {} - ApplyResult::Res(res) => results.push_back(res), + ApplyResult::Res(res) => { + results.push_back(res); + if self.wait_data { + apply_ctx.committed_count -= committed_entries_drainer.len(); + break; + } + } ApplyResult::Yield | ApplyResult::WaitMergeSource(_) => { // Both cancel and merge will yield current processing. apply_ctx.committed_count -= committed_entries_drainer.len() + 1; @@ -1488,7 +1532,8 @@ where | ExecResult::CompactLog { .. } | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } - | ExecResult::TransferLeader { .. } => {} + | ExecResult::TransferLeader { .. } + | ExecResult::HasPendingCompactCmd(..) => {} ExecResult::SplitRegion { ref derived, .. 
} => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -1509,6 +1554,12 @@ where ExecResult::SetFlashbackState { ref region } => { self.region = region.clone(); } + ExecResult::BatchSwitchWitness(ref switches) => { + self.region = switches.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } + } } } if let Some(epoch) = origin_epoch { @@ -1545,6 +1596,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_region_removed(self.region.get_id(), id, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), id, cmd); + } self.yield_state = None; let mut event = TraceEvent::default(); @@ -1562,6 +1616,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_stale_command(region_id, peer_id, self.term, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), peer_id, cmd); + } } fn clear_all_commands_silently(&mut self) { @@ -1571,6 +1628,9 @@ where if let Some(mut cmd) = self.pending_cmds.conf_change.take() { cmd.cb.take(); } + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take(); + } } } @@ -1589,7 +1649,8 @@ where req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_req_region_epoch(req, &self.region, include_region)?; check_flashback_state( - self.region.get_is_in_flashback(), + self.region.is_in_flashback, + self.region.flashback_start_ts, req, self.region_id(), false, @@ -1634,8 +1695,9 @@ where AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } - AdminCmdType::BatchSwitchWitness => Err(box_err!("unsupported admin command type")), + AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), + AdminCmdType::UpdateGcPeer => unimplemented!(), }?; response.set_cmd_type(cmd_type); @@ -2914,6 +2976,7 @@ where // Modify the region meta in memory. let mut region = self.region.clone(); region.set_is_in_flashback(is_in_flashback); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); // Modify the `RegionLocalState` persisted in disk. write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { panic!( @@ -2937,13 +3000,83 @@ where )) } + // When the first return value is true, it means that we have updated + // `RaftApplyState`, and the caller needs to do persistence. + fn try_compact_log( + &mut self, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) -> Result<(bool, Option>)> { + PEER_ADMIN_CMD_COUNTER.compact.all.inc(); + let first_index = entry_storage::first_index(&self.apply_state); + + if self.is_merging { + info!( + "in merging mode, skip compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + return Ok((false, None)); + } + + // When the witness restarted, the pending compact cmd has been lost, so use + // `voter_replicated_index` for gc to avoid log accumulation. 
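
The comment above describes the restart fallback; the branches that follow are easier to read as a small decision procedure. Here is a hedged, self-contained model of that logic, where a VecDeque of indexes stands in for the queued CompactLog commands; the Decision enum, the function name, and the sample indexes are illustrative, and the merging special case is skipped.

use std::collections::VecDeque;

// Illustration only: `pending` holds the indexes of queued CompactLog
// commands, oldest first, and the return value models the ExecResult.
#[derive(Debug, PartialEq)]
enum Decision {
    /// Truncate the raft log up to this index.
    CompactTo { index: u64, has_pending: bool },
    /// Nothing to do for now.
    Skip,
}

fn on_voter_replicated_index(
    pending: &mut VecDeque<u64>,
    first_index: u64,
    voter_replicated_index: u64,
) -> Decision {
    if pending.is_empty() {
        // The witness restarted and lost its queued compact commands, so fall
        // back to the replicated index reported by the leader.
        if voter_replicated_index <= first_index {
            return Decision::Skip;
        }
        return Decision::CompactTo { index: voter_replicated_index, has_pending: false };
    }
    // Pop every queued command already covered by all voters and truncate to
    // the newest of them (mirrors `pop_compact`).
    let mut target = None;
    while pending.front().map_or(false, |&i| i < voter_replicated_index) {
        target = pending.pop_front();
    }
    match target {
        Some(index) => Decision::CompactTo { index, has_pending: !pending.is_empty() },
        None => Decision::Skip,
    }
}

fn main() {
    // Compact commands at index 80 and 120 were queued while a voter was
    // lagging; the leader now reports voter_replicated_index = 100.
    let mut pending = VecDeque::from([80, 120]);
    assert_eq!(
        on_voter_replicated_index(&mut pending, 10, 100),
        Decision::CompactTo { index: 80, has_pending: true }
    );
}

In this sample the command at index 80 is already covered by every voter and is compacted, while the one at 120 stays queued, which is what the has_pending flag reports back to the peer fsm.
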
+ if !self.pending_cmds.has_compact() { + if voter_replicated_index <= first_index { + debug!( + "voter_replicated_index <= first index, no need to compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "compact_index" => voter_replicated_index, + "first_index" => first_index, + ); + return Ok((false, Some(ExecResult::HasPendingCompactCmd(false)))); + } + // compact failure is safe to be omitted, no need to assert. + compact_raft_log( + &self.tag, + &mut self.apply_state, + voter_replicated_index, + voter_replicated_term, + )?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + return Ok((true, Some(ExecResult::HasPendingCompactCmd(false)))); + } + + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + // compact failure is safe to be omitted, no need to assert. + compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + Ok(( + true, + Some(ExecResult::CompactLog { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + }), + )) + } + None => { + info!( + "latest voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + Ok((false, None)) + } + } + } + fn exec_compact_log( &mut self, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); - let compact_index = req.get_compact_log().get_compact_index(); + let mut compact_index = req.get_compact_log().get_compact_index(); let resp = AdminResponse::default(); let first_index = entry_storage::first_index(&self.apply_state); if compact_index <= first_index { @@ -2966,7 +3099,7 @@ where return Ok((resp, ApplyResult::None)); } - let compact_term = req.get_compact_log().get_compact_term(); + let mut compact_term = req.get_compact_log().get_compact_term(); // TODO: add unit tests to cover all the message integrity checks. if compact_term == 0 { info!( @@ -2981,6 +3114,44 @@ where )); } + let voter_replicated_index = req.get_compact_log().get_voter_replicated_index(); + // If there is any voter lagging behind, the log truncation of the witness + // shouldn't be triggered even if it's force mode(raft log size/count exceeds + // the threshold or raft engine purge), otherwise the witness can't help the + // lagging voter catch up logs when leader is down. In this situation Compact + // index should be queued. If witness receives a voter_replicated_index + // that is larger than the pending compact index, logs can be deleted. + if self.peer.is_witness { + if voter_replicated_index < compact_index { + self.pending_cmds.push_compact(PendingCmd::new( + compact_index, + compact_term, + Callback::None, + )); + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + compact_index = cmd.index; + compact_term = cmd.term; + } + None => { + info!( + "voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "command" => ?req.get_compact_log() + ); + return Ok(( + resp, + ApplyResult::Res(ExecResult::HasPendingCompactCmd(true)), + )); + } + } + } else { + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take().unwrap(); + } + } + } // compact failure is safe to be omitted, no need to assert. 
compact_raft_log( &self.tag, @@ -2996,6 +3167,7 @@ where ApplyResult::Res(ExecResult::CompactLog { state: self.apply_state.get_truncated_state().clone(), first_index, + has_pending: self.pending_cmds.has_compact(), }), )) } @@ -3058,6 +3230,90 @@ where )) } + fn exec_batch_switch_witness( + &mut self, + ctx: &mut ApplyContext, + request: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + assert!(request.has_switch_witnesses()); + let switches = request + .get_switch_witnesses() + .get_switch_witnesses() + .to_vec(); + + info!( + "exec BatchSwitchWitness"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "epoch" => ?self.region.get_region_epoch(), + ); + + let mut region = self.region.clone(); + for s in switches.as_slice() { + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.all.inc(); + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + let mut peer_is_exist = false; + for p in region.mut_peers().iter_mut() { + if p.id == peer_id { + if p.is_witness == is_witness { + return Err(box_err!( + "switch peer {:?} on region {:?} is no-op", + p, + self.region + )); + } + p.is_witness = is_witness; + peer_is_exist = true; + break; + } + } + if !peer_is_exist { + return Err(box_err!( + "switch peer {} on region {:?} failed: peer does not exist", + peer_id, + self.region + )); + } + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.success.inc(); + if self.id() == peer_id && !is_witness { + self.wait_data = true; + self.peer.is_witness = false; + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + switches.len() as u64; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!( + "switch witness successfully"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "switches" => ?switches, + "original region" => ?&self.region, + "current region" => ?®ion, + ); + + let state = if self.pending_remove { + PeerState::Tombstone + } else if self.wait_data { + PeerState::Unavailable + } else { + PeerState::Normal + }; + + if let Err(e) = write_peer_state(ctx.kv_wb_mut(), ®ion, state, None) { + panic!("{} failed to update region state: {:?}", self.tag, e); + } + + let resp = AdminResponse::default(); + Ok(( + resp, + ApplyResult::Res(ExecResult::BatchSwitchWitness(SwitchWitness { + index: ctx.exec_log_index, + switches, + region, + })), + )) + } + fn update_memory_trace(&mut self, event: &mut TraceEvent) { let pending_cmds = self.pending_cmds.heap_size(); let merge_yield = if let Some(ref mut state) = self.yield_state { @@ -3195,16 +3451,12 @@ impl Apply { pub fn on_schedule(&mut self, metrics: &RaftMetrics) { let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Some(trackers) = cb.cb.write_trackers_mut() { - for tracker in trackers { - tracker.observe(now, &metrics.store_time, |t| { - t.metrics.write_instant = Some(now); - &mut t.metrics.store_time_nanos - }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } - } + for tracker in cb.cb.write_trackers_mut() { + tracker.observe(now, &metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + tracker.reset(now); } } } @@ -3272,6 +3524,7 @@ pub struct Proposal { /// lease. 
pub propose_time: Option, pub must_pass_epoch_check: bool, + pub sent: bool, } impl Proposal { @@ -3283,6 +3536,7 @@ impl Proposal { propose_time: None, must_pass_epoch_check: false, is_conf_change: false, + sent: false, } } } @@ -3451,6 +3705,32 @@ where #[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + Recover(u64), + CheckCompact { + region_id: u64, + voter_replicated_index: u64, + voter_replicated_term: u64, + }, +} + +impl ResourceMetered for Msg { + fn get_resource_consumptions(&self) -> Option> { + match self { + Msg::Apply { apply, .. } => { + let mut map = HashMap::default(); + for cached_entries in &apply.entries { + cached_entries.iter_entries(|entry| { + // TODO: maybe use a more efficient way to get the resource group name. + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + *map.entry(group_name).or_default() += entry.compute_size() as u64; + }); + } + Some(map) + } + _ => None, + } + } } impl Msg @@ -3498,6 +3778,18 @@ where } => write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::Recover(region_id) => write!(f, "recover [region {}] apply", region_id), + Msg::CheckCompact { + region_id, + voter_replicated_index, + voter_replicated_term, + } => { + write!( + f, + "[region {}] check compact, voter_replicated_index: {}, voter_replicated_term: {}", + region_id, voter_replicated_index, voter_replicated_term + ) + } } } } @@ -3612,6 +3904,10 @@ where return; } + if self.delegate.wait_data { + return; + } + let mut entries = Vec::new(); let mut dangle_size = 0; @@ -3814,8 +4110,9 @@ where if self.delegate.pending_remove || self.delegate.stopped { return; } - if self.delegate.peer.is_witness { - // witness shouldn't generate snapshot. + if self.delegate.peer.is_witness || self.delegate.wait_data { + // witness or non-witness hasn't finish applying snapshot shouldn't generate + // snapshot. return; } let applied_index = self.delegate.apply_state.get_applied_index(); @@ -3947,6 +4244,45 @@ where cb.invoke_read(resp); } + fn check_pending_compact_log( + &mut self, + ctx: &mut ApplyContext, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) { + if self.delegate.pending_remove || self.delegate.stopped { + return; + } + + let res = self + .delegate + .try_compact_log(voter_replicated_index, voter_replicated_term); + match res { + Ok((should_write, res)) => { + if let Some(res) = res { + if ctx.timer.is_none() { + ctx.timer = Some(Instant::now_coarse()); + } + ctx.prepare_for(&mut self.delegate); + let mut result = VecDeque::new(); + // If modified `truncated_state` in `try_compact_log`, the apply state should be + // persisted. 
+ if should_write { + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + } + result.push_back(res); + ctx.finish_for(&mut self.delegate, result); + } + } + Err(e) => error!(?e; + "failed to compact log"; + "region_id" => self.delegate.region.get_id(), + "peer_id" => self.delegate.id(), + ), + } + } + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; @@ -3983,7 +4319,7 @@ where .cbs .iter() .flat_map(|p| p.cb.write_trackers()) - .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) + .flat_map(|ts| ts.as_tracker_token()) { GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; @@ -4002,8 +4338,11 @@ where } } } - batch_apply = Some(apply); + if !self.delegate.wait_data { + batch_apply = Some(apply); + } } + Msg::Recover(..) => self.delegate.wait_data = false, Msg::Registration(reg) => self.handle_registration(reg), Msg::Destroy(d) => self.handle_destroy(apply_ctx, d), Msg::LogsUpToDate(cul) => self.logs_up_to_date_for_merge(apply_ctx, cul), @@ -4019,6 +4358,17 @@ where let delegate = &self.delegate as *const ApplyDelegate as *const u8; f(delegate) } + Msg::CheckCompact { + voter_replicated_index, + voter_replicated_term, + .. + } => { + self.check_pending_compact_log( + apply_ctx, + voter_replicated_index, + voter_replicated_term, + ); + } } } } @@ -4080,6 +4430,7 @@ pub enum ControlMsg { }, } +impl ResourceMetered for ControlMsg {} pub struct ControlFsm { receiver: Receiver, stopped: bool, @@ -4429,6 +4780,16 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => return, + Msg::Recover(region_id) => { + info!("recover apply"; + "region_id" => region_id); + return; + } + Msg::CheckCompact { region_id, .. } => { + info!("target region is not found"; + "region_id" => region_id); + return; + } }, Either::Left(Err(TrySendError::Full(_))) => unreachable!(), }; @@ -4498,10 +4859,15 @@ impl ApplyBatchSystem { pub fn create_apply_batch_system( cfg: &Config, + resource_ctl: Option>, ) -> (ApplyRouter, ApplyBatchSystem) { let (control_tx, control_fsm) = ControlFsm::new(); - let (router, system) = - batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); + let (router, system) = batch_system::create_system( + &cfg.apply_batch_system, + control_tx, + control_fsm, + resource_ctl, + ); (ApplyRouter { router }, ApplyBatchSystem { system }) } @@ -4561,6 +4927,8 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::Recover(..) => 0, + Msg::CheckCompact { .. 
} => 0, } } } @@ -4646,6 +5014,7 @@ mod tests { cmd.mut_put().set_key(b"key".to_vec()); cmd.mut_put().set_value(b"value".to_vec()); let mut req = RaftCmdRequest::default(); + req.set_header(RaftRequestHeader::default()); req.mut_requests().push(cmd); e.set_data(req.write_to_bytes().unwrap().into()) } @@ -4878,6 +5247,7 @@ mod tests { cb, propose_time: None, must_pass_epoch_check: false, + sent: true, } } @@ -4912,7 +5282,7 @@ mod tests { let (_dir, importer) = create_tmp_importer("apply-basic"); let (region_scheduler, mut snapshot_rx) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5376,7 +5746,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5715,7 +6085,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5806,7 +6176,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5986,7 +6356,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6079,7 +6449,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-exec-observer".to_owned(), @@ -6303,7 +6673,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6583,7 +6953,7 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(obs)); let 
(region_scheduler, _) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6809,7 +7179,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "flashback_need_to_be_applied".to_owned(), diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 2f700eec9bf..b481caf4f74 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -14,7 +14,7 @@ pub use self::{ check_sst_for_ingestion, create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, ApplyRes, ApplyRouter, Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, - Registration, TaskRes as ApplyTaskRes, + Registration, SwitchWitness, TaskRes as ApplyTaskRes, }, peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1b484df5316..75da7d497e4 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -53,9 +53,9 @@ use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, - store::{find_peer, is_learner, region_on_same_stores}, + store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + time::{monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -75,7 +75,7 @@ use crate::{ apply, store::{PollContext, StoreMeta}, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, + ExecResult, SwitchWitness, }, hibernate_state::{GroupState, HibernateState}, local_metrics::{RaftMetrics, TimeTracker}, @@ -247,6 +247,7 @@ where raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, + wait_data: bool, ) -> Result> { let meta_peer = match find_peer(region, store_id) { None => { @@ -277,6 +278,7 @@ where engines, region, meta_peer, + wait_data, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -331,6 +333,7 @@ where engines, ®ion, peer, + false, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -524,13 +527,14 @@ where })) }; - let tokens: SmallVec<[TimeTracker; 4]> = cbs + let trackers: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - .filter_map(|cb| cb.write_trackers().map(|t| t[0])) + .flat_map(|cb| cb.write_trackers()) + .cloned() .collect(); - let mut cb = Callback::write_ext( - Box::new(move |resp| { + let cb = Callback::Write { + cb: Box::new(move |resp| { for cb in cbs { let mut cmd_resp = RaftCmdResponse::default(); 
cmd_resp.set_header(resp.response.get_header().clone()); @@ -539,12 +543,8 @@ where }), proposed_cb, committed_cb, - ); - - if let Some(trackers) = cb.write_trackers_mut() { - *trackers = tokens; - } - + trackers, + }; return Some((req, cb)); } None @@ -610,6 +610,9 @@ where for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -625,7 +628,7 @@ where .propose_wait_time .observe(propose_time.as_secs_f64()); cmd.callback.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_propose_wait_nanos = propose_time.as_nanos() as u64; }) @@ -694,7 +697,7 @@ where .raft_metrics .event_time .peer_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } #[inline] @@ -1195,6 +1198,8 @@ where PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestSnapshot => self.on_request_snapshot_tick(), + PeerTick::RequestVoterReplicatedIndex => self.on_request_voter_replicated_index(), } } @@ -1205,6 +1210,9 @@ where self.register_split_region_check_tick(); self.register_check_peer_stale_state_tick(); self.on_check_merge(); + if self.fsm.peer.wait_data { + self.on_request_snapshot_tick(); + } // Apply committed entries more quickly. // Or if it's a leader. This implicitly means it's a singleton // because it becomes leader in `Peer::new` when it's a @@ -1217,6 +1225,9 @@ where self.fsm.has_ready = true; } self.fsm.peer.maybe_gen_approximate_buckets(self.ctx); + if self.fsm.peer.is_witness() { + self.register_pull_voter_replicated_index_tick(); + } } fn on_gc_snap(&mut self, snaps: Vec<(SnapKey, bool)>) { @@ -1950,6 +1961,7 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); self.register_report_region_buckets_tick(); + self.register_check_peers_availability_tick(); } if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { @@ -2160,12 +2172,6 @@ where return; } - // Keep ticking if there are disk full peers for the Region. - if !self.fsm.peer.disk_full_peers.is_empty() { - self.register_raft_base_tick(); - return; - } - debug!("stop ticking"; "res" => ?res, "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -2257,6 +2263,9 @@ where "peer_id" => self.fsm.peer_id(), "res" => ?res, ); + if self.fsm.peer.wait_data { + return; + } self.on_ready_result(&mut res.exec_res, &res.metrics); if self.fsm.stopped { return; @@ -2466,6 +2475,17 @@ where return Ok(()); } + if MessageType::MsgAppend == msg_type + && self.fsm.peer.wait_data + && self.fsm.peer.should_reject_msgappend + { + debug!("skip {:?} because of non-witness waiting data", msg_type; + "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() + ); + self.ctx.raft_metrics.message_dropped.non_witness.inc(); + return Ok(()); + } + if !self.validate_raft_msg(&msg) { return Ok(()); } @@ -2602,6 +2622,7 @@ where fn on_hibernate_request(&mut self, from: &metapb::Peer) { if !self.ctx.cfg.hibernate_regions || self.fsm.peer.has_uncommitted_log() + || self.fsm.peer.wait_data || from.get_id() != self.fsm.peer.leader_id() { // Ignore the message means rejecting implicitly. 
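
The coprocessor_host.on_raft_message(&msg.msg) gate added to the message loop above is the dispatch side of the MessageObserver trait introduced earlier in this patch. The following is a rough standalone model of that hook: RaftMessage is replaced by a plain string, the filter observer is hypothetical, and the all() dispatch is an assumption about how a host would consult several observers, not a quote of the real CoprocessorHost.

// Illustration only: a self-contained model of the message-observer hook.
trait MessageObserver {
    /// Returns false if the message should not be stepped later.
    fn on_raft_message(&self, msg: &str) -> bool;
}

struct DropSnapshots;

impl MessageObserver for DropSnapshots {
    fn on_raft_message(&self, msg: &str) -> bool {
        // Hypothetical filter: swallow snapshot messages, pass everything else.
        msg != "MsgSnapshot"
    }
}

struct Host {
    observers: Vec<Box<dyn MessageObserver>>,
}

impl Host {
    // Models the gating added to the peer and store loops: if any registered
    // observer returns false, the message is skipped instead of being stepped.
    fn on_raft_message(&self, msg: &str) -> bool {
        self.observers.iter().all(|ob| ob.on_raft_message(msg))
    }
}

fn main() {
    let host = Host { observers: vec![Box::new(DropSnapshots)] };
    assert!(host.on_raft_message("MsgAppend"));
    assert!(!host.on_raft_message("MsgSnapshot"));
}

An observer registered through register_message_observer, as the dispatcher test above does, would presumably be consulted in this way before the raft message reaches the state machine.
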
@@ -2667,6 +2688,53 @@ where ); } + fn on_voter_replicated_index_request(&mut self, from: &metapb::Peer) { + if !self.fsm.peer.is_leader() { + return; + } + let mut voter_replicated_idx = self.fsm.peer.get_store().last_index(); + for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if voter_replicated_idx > p.matched && !is_learner(peer) { + voter_replicated_idx = p.matched; + } + } + let first_index = self.fsm.peer.get_store().first_index(); + if voter_replicated_idx > first_index { + voter_replicated_idx = first_index; + } + let mut resp = ExtraMessage::default(); + resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); + resp.index = voter_replicated_idx; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "leader responses voter_replicated_index to witness"; + "region_id" => self.region().get_id(), + "witness_id" => from.id, + "leader_id" => self.fsm.peer.peer.get_id(), + "voter_replicated_index" => voter_replicated_idx, + ); + } + + fn on_voter_replicated_index_response(&mut self, msg: &ExtraMessage) { + if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { + return; + } + let voter_replicated_index = msg.index; + if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { + self.ctx.apply_router.schedule_task( + self.region_id(), + ApplyTask::CheckCompact { + region_id: self.region_id(), + voter_replicated_index, + voter_replicated_term, + }, + ) + } + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { @@ -2716,6 +2784,14 @@ where ExtraMessageType::MsgAvailabilityResponse => { self.on_availability_response(msg.get_from_peer(), msg.get_extra_msg()); } + ExtraMessageType::MsgVoterReplicatedIndexRequest => { + self.on_voter_replicated_index_request(msg.get_from_peer()); + } + ExtraMessageType::MsgVoterReplicatedIndexResponse => { + self.on_voter_replicated_index_response(msg.get_extra_msg()); + } + ExtraMessageType::MsgGcPeerRequest => unimplemented!(), + ExtraMessageType::MsgGcPeerResponse => unimplemented!(), } } @@ -2999,7 +3075,7 @@ where if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() { - info!( + error!( "mismatch witness snapshot"; "region_id" => region_id, "peer_id" => self.fsm.peer_id(), @@ -3301,7 +3377,6 @@ where ); } else { self.fsm.peer.transfer_leader(&from); - self.fsm.peer.wait_data_peers.clear(); } } } @@ -3871,6 +3946,9 @@ where self.fsm.peer.schedule_raftlog_gc(self.ctx, compact_to); self.fsm.peer.last_compacted_idx = compact_to; self.fsm.peer.mut_store().on_compact_raftlog(compact_to); + if self.fsm.peer.is_witness() { + self.fsm.peer.last_compacted_time = Instant::now(); + } } fn on_ready_split_region( @@ -4012,6 +4090,7 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, + false, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { @@ -4861,8 +4940,13 @@ where while let Some(result) = exec_results.pop_front() { match result { ExecResult::ChangePeer(cp) => self.on_ready_change_peer(cp), - ExecResult::CompactLog { first_index, state } => { - self.on_ready_compact_log(first_index, state) + ExecResult::CompactLog { + state, + first_index, + has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + 
self.on_ready_compact_log(first_index, state); } ExecResult::SplitRegion { derived, @@ -4897,6 +4981,15 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::BatchSwitchWitness(switches) => { + self.on_ready_batch_switch_witness(switches) + } + ExecResult::HasPendingCompactCmd(has_pending) => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + if has_pending { + self.register_pull_voter_replicated_index_tick(); + } + } } } @@ -5058,8 +5151,29 @@ where && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) { self.ctx.raft_metrics.invalid_proposal.witness.inc(); - // TODO: use a dedicated error type - return Err(Error::RecoveryInProgress(self.region_id())); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests to switch it into a witness when it's a leader + if self.fsm.peer.is_leader() + && msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSwitchWitness + && msg + .get_admin_request() + .get_switch_witnesses() + .get_switch_witnesses() + .iter() + .any(|s| s.get_peer_id() == self.fsm.peer.peer.get_id() && s.get_is_witness()) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests when it becomes to non-witness but not finish applying + // snapshot. + if self.fsm.peer.wait_data { + self.ctx.raft_metrics.invalid_proposal.non_witness.inc(); + return Err(Error::IsWitness(self.region_id())); } // check whether the peer is initialized. @@ -5112,9 +5226,13 @@ where // the apply phase and because a read-only request doesn't need to be applied, // so it will be allowed during the flashback progress, for example, a snapshot // request. 
- if let Err(e) = - util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) - { + if let Err(e) = util::check_flashback_state( + self.region().is_in_flashback, + self.region().flashback_start_ts, + msg, + region_id, + true, + ) { match e { Error::FlashbackInProgress(_) => self .ctx @@ -5166,7 +5284,7 @@ where if self.ctx.raft_metrics.waterfall_metrics { let now = Instant::now(); - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { &mut t.metrics.wf_batch_wait_nanos }); @@ -5315,8 +5433,13 @@ where let first_idx = self.fsm.peer.get_store().first_index(); let last_idx = self.fsm.peer.get_store().last_index(); + let mut voter_replicated_idx = last_idx; let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if !is_learner(peer) && voter_replicated_idx > p.matched { + voter_replicated_idx = p.matched; + } if replicated_idx > p.matched { replicated_idx = p.matched; } @@ -5405,7 +5528,8 @@ where let region_id = self.fsm.peer.region().get_id(); let peer = self.fsm.peer.peer.clone(); let term = self.fsm.peer.get_index_term(compact_idx); - let request = new_compact_log_request(region_id, peer, compact_idx, term); + let request = + new_compact_log_request(region_id, peer, compact_idx, term, voter_replicated_idx); self.propose_raft_command_internal( request, Callback::None, @@ -5444,6 +5568,56 @@ where self.register_check_long_uncommitted_tick(); } + fn on_request_snapshot_tick(&mut self) { + fail_point!("ignore request snapshot", |_| { + self.schedule_tick(PeerTick::RequestSnapshot); + }); + if !self.fsm.peer.wait_data || self.fsm.peer.is_leader() { + return; + } + self.fsm.peer.request_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let last_term = self.fsm.peer.get_index_term(self.fsm.peer.request_index); + if last_term == self.fsm.peer.term() { + self.fsm.peer.should_reject_msgappend = true; + if let Err(e) = self.fsm.peer.raft_group.request_snapshot() { + error!( + "failed to request snapshot"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + ); + } + } else { + // If a leader change occurs after switch to non-witness, it should be + // continue processing `MsgAppend` until `last_term == term`, then retry + // to request snapshot. + self.fsm.peer.should_reject_msgappend = false; + } + // Requesting a snapshot may fail, so register a periodic event as a defense + // until succeeded. 
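
Because the request may fail and the peer may also see a leader change mid-way, the tick keeps re-arming itself. A standalone sketch of that retry behaviour follows, under the assumption that wait_data is only cleared once the requested snapshot has been applied; PeerModel and its fields are illustrative, not the real PeerFsm.

// Illustration only: models when the RequestSnapshot tick stays armed.
#[derive(Default)]
struct PeerModel {
    wait_data: bool,
    is_leader: bool,
    current_term: u64,
    last_entry_term: u64,
    should_reject_msgappend: bool,
    snapshot_requested: bool,
}

impl PeerModel {
    /// Returns true if the RequestSnapshot tick should be scheduled again.
    fn on_request_snapshot_tick(&mut self) -> bool {
        if !self.wait_data || self.is_leader {
            return false;
        }
        if self.last_entry_term == self.current_term {
            // The local log has caught up to the current term, so stop
            // accepting MsgAppend and ask the leader for a full snapshot.
            self.should_reject_msgappend = true;
            self.snapshot_requested = true; // the request itself may still fail
        } else {
            // A leader change happened after switching to non-witness: keep
            // processing MsgAppend until the terms line up, then retry.
            self.should_reject_msgappend = false;
        }
        // Stay armed until `wait_data` is cleared by the applied snapshot.
        true
    }
}

fn main() {
    let mut p = PeerModel {
        wait_data: true,
        current_term: 7,
        last_entry_term: 6,
        ..Default::default()
    };
    assert!(p.on_request_snapshot_tick());
    assert!(!p.should_reject_msgappend);
    p.last_entry_term = 7;
    assert!(p.on_request_snapshot_tick());
    assert!(p.should_reject_msgappend && p.snapshot_requested);
}
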
+ self.schedule_tick(PeerTick::RequestSnapshot); + } + + fn on_request_voter_replicated_index(&mut self) { + if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { + return; + } + if self.fsm.peer.last_compacted_time.elapsed() + > self.ctx.cfg.request_voter_replicated_index_interval.0 + { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); + let leader_id = self.fsm.peer.leader_id(); + let leader = self.fsm.peer.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &leader); + } + } + self.register_pull_voter_replicated_index_tick(); + } + fn register_check_leader_lease_tick(&mut self) { self.schedule_tick(PeerTick::CheckLeaderLease) } @@ -5965,18 +6139,35 @@ where } fn on_check_peers_availability(&mut self) { + let mut invalid_peers: Vec = Vec::new(); for peer_id in self.fsm.peer.wait_data_peers.iter() { - let peer = self.fsm.peer.get_peer_from_cache(*peer_id).unwrap(); - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityRequest); - self.fsm - .peer - .send_extra_message(msg, &mut self.ctx.trans, &peer); - debug!( - "check peer availability"; - "target peer id" => *peer_id, - ); + match self.fsm.peer.get_peer_from_cache(*peer_id) { + Some(peer) => { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityRequest); + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &peer); + debug!( + "check peer availability"; + "target peer id" => *peer_id, + ); + } + None => invalid_peers.push(*peer_id), + } } + // For some reasons, the peer corresponding to the previously saved peer_id + // no longer exists. In order to avoid passing invalid information to pd when + // reporting pending peers and affecting pd scheduling, remove it from the + // `wait_data_peers`. + self.fsm + .peer + .wait_data_peers + .retain(|peer_id| !invalid_peers.contains(peer_id)); + } + + fn register_pull_voter_replicated_index_tick(&mut self) { + self.schedule_tick(PeerTick::RequestVoterReplicatedIndex); } fn on_check_peer_stale_state_tick(&mut self) { @@ -6257,6 +6448,50 @@ where self.fsm.peer.leader_lease_mut().expire_remote_lease(); } + fn on_ready_batch_switch_witness(&mut self, sw: SwitchWitness) { + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.set_region( + &self.ctx.coprocessor_host, + sw.region, + &mut self.fsm.peer, + RegionChangeReason::SwitchWitness, + ); + } + for s in sw.switches { + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + if self.fsm.peer_id() == peer_id { + if is_witness && !self.fsm.peer.is_leader() { + let _ = self.fsm.peer.get_store().clear_data(); + self.fsm.peer.raft_group.set_priority(-1); + } else { + self.fsm + .peer + .update_read_progress(self.ctx, ReadProgress::WaitData(true)); + self.fsm.peer.wait_data = true; + self.on_request_snapshot_tick(); + } + self.fsm.peer.peer.is_witness = is_witness; + continue; + } + if !is_witness && !self.fsm.peer.wait_data_peers.contains(&peer_id) { + self.fsm.peer.wait_data_peers.push(peer_id); + } + } + if self.fsm.peer.is_leader() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "region" => ?self.fsm.peer.region(), + ); + self.fsm.peer.heartbeat_pd(self.ctx); + if !self.fsm.peer.wait_data_peers.is_empty() { + self.register_check_peers_availability_tick(); + } + } + } + /// Verify and store the hash to state. 
return true means the hash has been /// stored successfully. // TODO: Consider context in the function. @@ -6421,6 +6656,7 @@ fn new_compact_log_request( peer: metapb::Peer, compact_index: u64, compact_term: u64, + voter_replicated_index: u64, ) -> RaftCmdRequest { let mut request = new_admin_request(region_id, peer); @@ -6428,6 +6664,9 @@ fn new_compact_log_request( admin.set_cmd_type(AdminCmdType::CompactLog); admin.mut_compact_log().set_compact_index(compact_index); admin.mut_compact_log().set_compact_term(compact_term); + admin + .mut_compact_log() + .set_voter_replicated_index(voter_replicated_index); request.set_admin_request(admin); request } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 54bb7d0cc0b..85631bebe09 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -42,6 +42,7 @@ use kvproto::{ use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -594,6 +595,11 @@ where self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = self.cfg.check_peers_availability_interval.0; + self.tick_batch[PeerTick::RequestSnapshot as usize].wait_duration = + self.cfg.check_request_snapshot_interval.0; + // TODO: make it reasonable + self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0 * 2; } } @@ -747,6 +753,9 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { if matches!(&e, Error::RegionNotRegistered { .. 
}) { // This may happen in normal cases when add-peer runs slowly @@ -800,7 +809,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .raft_metrics .event_time .store_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } fn start(&mut self, store: metapb::Store) { @@ -1203,6 +1212,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), region, + local_state.get_state() == PeerState::Unavailable, )); peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { @@ -1243,6 +1253,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, + false, )?; peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); @@ -1371,14 +1382,14 @@ where ready_count: 0, has_ready: false, current_time: None, - raft_perf_context: self - .engines - .raft - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), - kv_perf_context: self - .engines - .kv - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), + raft_perf_context: ER::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), + kv_perf_context: EK::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], node_start_time: Some(TiInstant::now_coarse()), feature_gate: self.feature_gate.clone(), @@ -1513,7 +1524,9 @@ impl RaftBatchSystem { ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. - let purge_worker = if engines.raft.need_manual_purge() { + let purge_worker = if engines.raft.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = engines.raft.clone(); let router_clone = self.router(); @@ -1732,7 +1745,6 @@ impl RaftBatchSystem { Arc::clone(&pd_client), self.router.clone(), workers.pd_worker.scheduler(), - cfg.pd_store_heartbeat_tick_interval.0, auto_split_controller, concurrency_manager, snap_mgr, @@ -1787,11 +1799,21 @@ impl RaftBatchSystem { pub fn create_raft_batch_system( cfg: &Config, + resource_manager: &Option>, ) -> (RaftRouter, RaftBatchSystem) { let (store_tx, store_fsm) = StoreFsm::new(cfg); - let (apply_router, apply_system) = create_apply_batch_system(cfg); - let (router, system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + let (apply_router, apply_system) = create_apply_batch_system( + cfg, + resource_manager + .as_ref() + .map(|m| m.derive_controller("apply".to_owned(), false)), + ); + let (router, system) = batch_system::create_system( + &cfg.store_batch_system, + store_tx, + store_fsm, + None, // Do not do priority scheduling for store batch system + ); let raft_router = RaftRouter { router }; let system = RaftBatchSystem { system, @@ -2907,6 +2929,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), ®ion, + false, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5cfbb645612..0e6a09cbf0b 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -7,7 +7,7 @@ use 
collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; -use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; use super::metrics::*; @@ -208,47 +208,60 @@ impl StoreWriteMetrics { /// Tracker for the durations of a raftstore request. /// If a global tracker is not available, it will fallback to an Instant. #[derive(Debug, Clone, Copy)] -pub enum TimeTracker { - Tracker(TrackerToken), - Instant(std::time::Instant), +pub struct TimeTracker { + token: TrackerToken, + start: std::time::Instant, +} + +impl Default for TimeTracker { + #[inline] + fn default() -> Self { + let token = tracker::get_tls_tracker_token(); + let start = std::time::Instant::now(); + let tracker = TimeTracker { token, start }; + if token == INVALID_TRACKER_TOKEN { + return tracker; + } + + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.write_instant = Some(start); + }); + tracker + } } impl TimeTracker { + #[inline] pub fn as_tracker_token(&self) -> Option { - match self { - TimeTracker::Tracker(tt) => Some(*tt), - TimeTracker::Instant(_) => None, + if self.token == INVALID_TRACKER_TOKEN { + None + } else { + Some(self.token) } } + #[inline] pub fn observe( &self, now: std::time::Instant, local_metric: &LocalHistogram, tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, ) { - match self { - TimeTracker::Tracker(t) => { - if let Some(dur) = GLOBAL_TRACKERS - .with_tracker(*t, |tracker| { - tracker.metrics.write_instant.map(|write_instant| { - let dur = now.saturating_duration_since(write_instant); - let metric = tracker_metric(tracker); - if *metric == 0 { - *metric = dur.as_nanos() as u64; - } - dur - }) - }) - .flatten() - { - local_metric.observe(dur.as_secs_f64()); - } - } - TimeTracker::Instant(t) => { - let dur = now.saturating_duration_since(*t); - local_metric.observe(dur.as_secs_f64()); - } + let dur = now.saturating_duration_since(self.start); + local_metric.observe(dur.as_secs_f64()); + if self.token == INVALID_TRACKER_TOKEN { + return; } + GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { + let metric = tracker_metric(tracker); + if *metric == 0 { + *metric = dur.as_nanos() as u64; + } + }); + } + + #[inline] + pub fn reset(&mut self, start: std::time::Instant) { + self.start = start; } } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index b0f44c30c0f..6c6357d286c 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -35,7 +35,8 @@ make_auto_flush_static_metric! { compact, transfer_leader, prepare_flashback, - finish_flashback + finish_flashback, + batch_switch_witness : "batch-switch-witness", } pub label_enum AdminCmdStatus { @@ -177,6 +178,8 @@ make_static_metric! { region_nonexistent, applying_snap, disk_full, + non_witness, + recovery, } pub label_enum ProposalType { @@ -204,7 +207,8 @@ make_static_metric! 
{ force_leader, witness, flashback_in_progress, - flashback_not_prepared + flashback_not_prepared, + non_witness, } pub label_enum RaftEventDurationType { diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 62561c63cbc..fe3c12427bd 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -31,7 +31,10 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, - write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, + write::{ + write_to_db_for_test, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, + WriteTask, + }, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ @@ -76,9 +79,10 @@ pub use self::{ worker::{ metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, - ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, - SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, - TrackVer, WriteStats, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a4c6c435741..195a94478dc 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::{borrow::Cow, fmt}; +use batch_system::ResourceMetered; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; @@ -24,7 +25,7 @@ use pd_client::BucketMeta; use raft::SnapshotStatus; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken}; use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ @@ -137,16 +138,7 @@ where proposed_cb: Option, committed_cb: Option, ) -> Self { - let tracker_token = get_tls_tracker_token(); - let now = std::time::Instant::now(); - let tracker = if tracker_token == INVALID_TRACKER_TOKEN { - TimeTracker::Instant(now) - } else { - GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { - tracker.metrics.write_instant = Some(now); - }); - TimeTracker::Tracker(tracker_token) - }; + let tracker = TimeTracker::default(); Callback::Write { cb, @@ -217,7 +209,7 @@ pub trait ReadCallback: ErrorCallback { type Response; fn set_result(self, result: Self::Response); - fn read_tracker(&self) -> Option<&TrackerToken>; + fn read_tracker(&self) -> Option; } pub trait WriteCallback: ErrorCallback { @@ -225,8 +217,16 @@ pub trait WriteCallback: ErrorCallback { fn notify_proposed(&mut self); fn notify_committed(&mut self); - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; - fn write_trackers_mut(&mut self) -> Option<&mut 
SmallVec<[TimeTracker; 4]>>; + + type TimeTrackerListRef<'a>: IntoIterator + where + Self: 'a; + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_>; + + type TimeTrackerListMut<'a>: IntoIterator + where + Self: 'a; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_>; fn set_result(self, result: Self::Response); } @@ -257,9 +257,9 @@ impl ReadCallback for Callback { self.invoke_read(result); } - fn read_tracker(&self) -> Option<&TrackerToken> { + fn read_tracker(&self) -> Option { let Callback::Read { tracker, .. } = self else { return None; }; - Some(tracker) + Some(*tracker) } } @@ -276,16 +276,24 @@ impl WriteCallback for Callback { self.invoke_committed(); } + type TimeTrackerListRef<'a> = impl IntoIterator; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + let trackers = match self { + Callback::Write { trackers, .. } => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } + type TimeTrackerListMut<'a> = impl IntoIterator; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + let trackers = match self { + Callback::Write { trackers, .. } => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } #[inline] @@ -296,7 +304,7 @@ impl WriteCallback for Callback { impl WriteCallback for Vec where - C: WriteCallback, + C: WriteCallback + 'static, C::Response: Clone, { type Response = C::Response; @@ -315,14 +323,16 @@ where } } + type TimeTrackerListRef<'a> = impl Iterator + 'a; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + self.iter().flat_map(|c| c.write_trackers()) } + type TimeTrackerListMut<'a> = impl Iterator + 'a; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + self.iter_mut().flat_map(|c| c.write_trackers_mut()) } #[inline] @@ -375,6 +385,8 @@ pub enum PeerTick { ReportBuckets = 9, CheckLongUncommitted = 10, CheckPeersAvailability = 11, + RequestSnapshot = 12, + RequestVoterReplicatedIndex = 13, } impl PeerTick { @@ -395,6 +407,8 @@ impl PeerTick { PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestSnapshot => "request_snapshot", + PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -412,6 +426,8 @@ impl PeerTick { PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, PeerTick::CheckPeersAvailability, + PeerTick::RequestSnapshot, + PeerTick::RequestVoterReplicatedIndex, ]; TICKS } @@ -757,6 +773,8 @@ pub enum PeerMsg { Destroy(u64), } +impl ResourceMetered for PeerMsg {} + impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -852,6 +870,8 @@ where }, } +impl ResourceMetered for StoreMsg {} + impl fmt::Debug for StoreMsg where EK: KvEngine, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a72bb59d8bf..a6010a6761f 100644 --- a/components/raftstore/src/store/peer.rs +++ 
b/components/raftstore/src/store/peer.rs @@ -70,7 +70,7 @@ use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -141,16 +141,16 @@ impl ProposalQueue { /// Find the trackers of given index. /// Caller should check if term is matched before using trackers. - fn find_trackers(&self, index: u64) -> Option<(u64, &SmallVec<[TimeTracker; 4]>)> { + pub fn find_trackers(&self, index: u64) -> Option<(u64, C::TimeTrackerListRef<'_>)> { self.queue .binary_search_by_key(&index, |p: &Proposal<_>| p.index) .ok() - .and_then(|i| { - self.queue[i] - .cb - .write_trackers() - .map(|ts| (self.queue[i].term, ts)) - }) + .map(|i| (self.queue[i].term, self.queue[i].cb.write_trackers())) + } + + #[inline] + pub fn queue_mut(&mut self) -> &mut VecDeque> { + &mut self.queue } pub fn find_propose_time(&self, term: u64, index: u64) -> Option { @@ -200,7 +200,7 @@ impl ProposalQueue { } #[inline] - fn oldest(&self) -> Option<&Proposal> { + pub fn oldest(&self) -> Option<&Proposal> { self.queue.front() } @@ -894,6 +894,17 @@ where /// the same time period. pub wait_data: bool, + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, but the request may fail, so we need to save + /// the request index for retrying. + pub request_index: u64, + + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, In order to avoid log lag, we need to reject + /// the leader's `MsgAppend` request unless the `term` of the `last index` + /// is less than the peer's current `term`. + pub should_reject_msgappend: bool, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader /// out of accordance with Raft election rule, and forbids any @@ -939,6 +950,15 @@ where /// The index of last compacted raft log. It is used for the next compact /// log task. pub last_compacted_idx: u64, + /// Record the time of the last raft log compact, the witness should query + /// the leader periodically whether `voter_replicated_index` is updated + /// if CompactLog admin command isn't triggered for a while. + pub last_compacted_time: Instant, + /// When the peer is witness, and there is any voter lagging behind, the + /// log truncation of the witness shouldn't be triggered even if it's + /// force mode, and this item will be set to `true`, after all pending + /// compact cmds have been handled, it will be set to `false`. + pub has_pending_compact_cmd: bool, /// The index of the latest urgent proposal index. last_urgent_proposal_idx: u64, /// The index of the latest committed split command. 
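The new `wait_data`, `request_index`, `should_reject_msgappend`, and `has_pending_compact_cmd` fields documented above describe how a witness that is switched to non-witness catches up: it keeps asking the leader for a snapshot, retrying on the new `RequestSnapshot` tick (paced by `check_request_snapshot_interval`), and holds off the leader's `MsgAppend` entries until the data has arrived. A minimal sketch of that retry loop, using hypothetical names (`SnapshotRequester`, `send_request_snapshot`) that are not part of this patch:

```rust
/// Minimal sketch of the retry idea behind the new `wait_data` /
/// `request_index` fields. The struct and closure below are illustrative
/// stand-ins, not APIs from this patch.
struct SnapshotRequester {
    /// Set when a witness has been switched to non-witness and must catch up
    /// from a snapshot before serving anything.
    wait_data: bool,
    /// Index recorded when the request was issued; kept so a lost request can
    /// be re-sent on the next `RequestSnapshot` tick.
    request_index: u64,
}

impl SnapshotRequester {
    /// Driven by a periodic tick (the patch adds `PeerTick::RequestSnapshot`).
    fn on_request_snapshot_tick(&mut self, send_request_snapshot: impl Fn(u64) -> bool) {
        if !self.wait_data {
            // The snapshot has already been applied; nothing to retry.
            return;
        }
        // A failed or dropped request is tolerated: the tick fires again later
        // and the saved index is reused.
        let _sent = send_request_snapshot(self.request_index);
    }
}
```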
@@ -1046,6 +1066,7 @@ where engines: Engines, region: &metapb::Region, peer: metapb::Peer, + wait_data: bool, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -1077,12 +1098,17 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, - // TODO: if peer.is_witness { 0 } else { 1 }, + priority: if peer.is_witness { -1 } else { 0 }, ..Default::default() }; let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + let last_index = raft_group.store().last_index(); + // In order to avoid excessive log accumulation due to the loss of pending + // compaction cmds after the witness is restarted, it will actively pull + // voter_request_index once at start. + let has_pending_compact_cmd = peer.is_witness; let mut peer = Peer { peer, @@ -1105,7 +1131,9 @@ where compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, - wait_data: false, + wait_data, + request_index: last_index, + should_reject_msgappend: false, should_wake_up: false, force_leader: None, pending_merge_state: None, @@ -1118,6 +1146,8 @@ where tag: tag.clone(), last_applying_idx: applied_index, last_compacted_idx: 0, + last_compacted_time: Instant::now(), + has_pending_compact_cmd, last_urgent_proposal_idx: u64::MAX, last_committed_split_idx: 0, last_sent_snapshot_idx: 0, @@ -1173,6 +1203,9 @@ where peer.raft_group.campaign()?; } + let persisted_index = peer.raft_group.raft.raft_log.persisted; + peer.mut_store().update_cache_persisted(persisted_index); + Ok(peer) } @@ -1574,6 +1607,14 @@ where res.reason = "replication mode"; return res; } + if !self.disk_full_peers.is_empty() { + res.reason = "has disk full peers"; + return res; + } + if !self.wait_data_peers.is_empty() { + res.reason = "has wait data peers"; + return res; + } res.up_to_date = true; res } @@ -1599,6 +1640,8 @@ where && !self.has_unresolved_reads() // If it becomes leader, the stats is not valid anymore. && !self.is_leader() + // Keep ticking if it's waiting for snapshot. + && !self.wait_data } } @@ -1807,7 +1850,7 @@ where { let proposal = &self.proposals.queue[idx]; if term == proposal.term { - for tracker in proposal.cb.write_trackers().iter().flat_map(|v| v.iter()) { + for tracker in proposal.cb.write_trackers() { tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { &mut t.metrics.wf_send_proposal_nanos }); @@ -2043,6 +2086,12 @@ where let status = self.raft_group.status(); let truncated_idx = self.get_store().truncated_index(); + for peer_id in &self.wait_data_peers { + if let Some(p) = self.get_peer_from_cache(*peer_id) { + pending_peers.push(p); + } + } + if status.progress.is_none() { return pending_peers; } @@ -2119,6 +2168,9 @@ where if self.peers_start_pending_time[i].0 != peer_id { continue; } + if self.wait_data_peers.contains(&peer_id) { + continue; + } let truncated_idx = self.raft_group.store().truncated_index(); if let Some(progress) = self.raft_group.raft.prs().get(peer_id) { if progress.matched >= truncated_idx { @@ -2283,6 +2335,7 @@ where prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, initialized: self.is_initialized(), + peer_id: self.peer.get_id(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); @@ -2376,8 +2429,12 @@ where // a stale heartbeat can make the leader think follower has already applied // the snapshot, and send remaining log entries, which may increase // commit_index. 
+ // + // If it's witness before, but a command changes it to non-witness, it will stop + // applying all following command, therefore, add the judgment of `wait_data` to + // avoid applying snapshot is also blocked. // TODO: add more test - self.last_applying_idx == self.get_store().applied_index() + (self.last_applying_idx == self.get_store().applied_index() || self.wait_data) // Requesting snapshots also triggers apply workers to write // apply states even if there is no pending committed entry. // TODO: Instead of sharing the counter, we should apply snapshots @@ -2547,11 +2604,18 @@ where // i.e. call `RawNode::advance_apply_to`. self.post_pending_read_index_on_replica(ctx); // Resume `read_progress` + self.update_read_progress(ctx, ReadProgress::WaitData(false)); self.read_progress.resume(); // Update apply index to `last_applying_idx` self.read_progress .update_applied(self.last_applying_idx, &ctx.coprocessor_host); - self.notify_leader_the_peer_is_available(ctx); + if self.wait_data { + self.notify_leader_the_peer_is_available(ctx); + ctx.apply_router + .schedule_task(self.region_id, ApplyTask::Recover(self.region_id)); + self.wait_data = false; + return false; + } } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. @@ -2572,22 +2636,19 @@ where &mut self, ctx: &mut PollContext, ) { - if self.wait_data { - self.wait_data = false; - fail_point!("ignore notify leader the peer is available", |_| {}); - let leader_id = self.leader_id(); - let leader = self.get_peer_from_cache(leader_id); - if let Some(leader) = leader { - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityResponse); - msg.wait_data = false; - self.send_extra_message(msg, &mut ctx.trans, &leader); - info!( - "notify leader the leader is available"; - "region id" => self.region().get_id(), - "peer id" => self.peer.id - ); - } + fail_point!("ignore notify leader the peer is available", |_| {}); + let leader_id = self.leader_id(); + let leader = self.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityResponse); + msg.wait_data = false; + self.send_extra_message(msg, &mut ctx.trans, &leader); + info!( + "notify leader the peer is available"; + "region id" => self.region().get_id(), + "peer id" => self.peer.id + ); } } @@ -2749,8 +2810,8 @@ where for entry in ready.entries() { if let Some((term, times)) = self.proposals.find_trackers(entry.get_index()) { if entry.term == term { - trackers.extend_from_slice(times); for tracker in times { + trackers.push(*tracker); tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { &mut t.metrics.wf_send_to_queue_nanos }); @@ -3110,9 +3171,8 @@ where "after" => ?peer, ); self.peer = peer; - // TODO: set priority for witness - // self.raft_group - // .set_priority(if self.peer.is_witness { 0 } else { 1 }); + self.raft_group + .set_priority(if self.peer.is_witness { -1 } else { 0 }); }; self.activate(ctx); @@ -3274,7 +3334,7 @@ where let time = monotonic_raw_now(); for (req, cb, mut read_index) in read.take_cmds().drain(..) 
{ cb.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read.propose_time).to_std().unwrap().as_nanos() as u64; }) @@ -3568,6 +3628,16 @@ where reader.update(progress); } + pub fn update_read_progress( + &self, + ctx: &mut PollContext, + progress: ReadProgress, + ) { + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&self.region_id).unwrap(); + self.maybe_update_read_progress(reader, progress); + } + pub fn maybe_campaign(&mut self, parent_is_leader: bool) -> bool { if self.region().get_peers().len() <= 1 { // The peer campaigned when it was created, no need to do it again. @@ -3669,6 +3739,7 @@ where cb, propose_time: None, must_pass_epoch_check: has_applied_to_current_term, + sent: false, }; if let Some(cmd_type) = req_admin_cmd_type { self.cmd_epoch_checker @@ -4000,6 +4071,7 @@ where cb: Callback::None, propose_time: Some(now), must_pass_epoch_check: false, + sent: false, }; self.post_propose(poll_ctx, p); } @@ -4414,13 +4486,10 @@ where msg: &eraftpb::Message, peer_disk_usage: DiskUsage, ) -> bool { - if self.is_witness() { - // shouldn't transfer leader to witness peer - return true; - } - let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); - if pending_snapshot + // shouldn't transfer leader to witness peer or non-witness waiting data + if self.is_witness() || self.wait_data + || pending_snapshot || msg.get_from() != self.leader_id() // Transfer leader to node with disk full will lead to write availablity downback. // But if the current leader is disk full, and send such request, we should allow it, @@ -4435,6 +4504,8 @@ where "from" => msg.get_from(), "pending_snapshot" => pending_snapshot, "disk_usage" => ?ctx.self_disk_usage, + "is_witness" => self.is_witness(), + "wait_data" => self.wait_data, ); return true; } @@ -4773,7 +4844,7 @@ where return; } if let Some(ref state) = self.pending_merge_state { - if state.get_commit() == extra_msg.get_premerge_commit() { + if state.get_commit() == extra_msg.get_index() { self.add_want_rollback_merge_peer(peer_id); } } @@ -5368,7 +5439,7 @@ where }; let mut extra_msg = ExtraMessage::default(); extra_msg.set_type(ExtraMessageType::MsgWantRollbackMerge); - extra_msg.set_premerge_commit(premerge_commit); + extra_msg.set_index(premerge_commit); self.send_extra_message(extra_msg, &mut ctx.trans, &to_peer); } @@ -5725,6 +5796,7 @@ mod tests { AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, AdminCmdType::BatchSwitchWitness, + AdminCmdType::UpdateGcPeer, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); @@ -5923,6 +5995,7 @@ mod tests { cb: Callback::write(Box::new(|_| {})), propose_time: Some(u64_to_timespec(index)), must_pass_epoch_check: false, + sent: false, }); }; for index in 1..=100 { @@ -5996,6 +6069,7 @@ mod tests { is_conf_change: false, propose_time: None, must_pass_epoch_check: false, + sent: false, }); } for (index, term) in entries { diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index c9e460d1cbc..8dc8a18906c 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -449,6 +449,11 @@ where /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// available snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + fail_point!("ignore generate snapshot", self.peer_id == 1, |_| { + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + }); if self.peer.as_ref().unwrap().is_witness { // witness could be the leader for a while, do not generate snapshot now return Err(raft::Error::Store( @@ -457,6 +462,18 @@ where } if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // Although we always sending snapshot task behind apply task to get latest + // snapshot, we can't use `last_applying_idx` here, as below the judgment + // condition will generate an witness snapshot directly, the new non-witness + // will ingore this mismatch snapshot and can't request snapshot successfully + // again. + if self.applied_index() < request_index { + // It may be a request from non-witness. In order to avoid generating mismatch + // snapshots, wait for apply non-witness to complete + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } // generate an empty snapshot for witness directly return Ok(util::new_empty_snapshot( self.region.clone(), @@ -666,6 +683,7 @@ where "peer_id" => self.peer_id, "region" => ?region, "state" => ?self.apply_state(), + "for_witness" => for_witness, ); Ok((region, for_witness)) @@ -2082,7 +2100,7 @@ pub mod tests { let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. raft_state.set_last_index(11); - lb.append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); lb.put_raft_state(1, &raft_state).unwrap(); @@ -2093,7 +2111,7 @@ pub mod tests { let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); @@ -2138,7 +2156,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); engines.raft.gc(1, 0, 21, &mut lb).unwrap(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); @@ -2150,7 +2168,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); @@ -2158,7 +2176,7 @@ pub mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); @@ -2168,7 +2186,7 @@ pub mod tests { engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 05decd62815..358ec716195 
100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -207,7 +207,9 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { +// Create a SnapshotMeta that can be later put into RaftSnapshotData or written +// into file. +pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -663,7 +665,8 @@ impl Snapshot { Ok(snapshot_meta) } - fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { + // Validate and set SnapshotMeta of this Snapshot. + pub fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { let mut cf_file_count_from_meta: Vec = vec![]; let mut file_count = 0; let mut current_cf = ""; @@ -812,8 +815,9 @@ impl Snapshot { } } - // Only called in `do_build`. - fn save_meta_file(&mut self) -> RaftStoreResult<()> { + // Save `SnapshotMeta` to file. + // Used in `do_build` and by external crates. + pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -1125,6 +1129,10 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + pub fn total_size(&self) -> u64 { self.cf_files .iter() @@ -1998,7 +2006,12 @@ impl TabletSnapManager { { continue; } - for e in file_system::read_dir(path)? { + let entries = match file_system::read_dir(path) { + Ok(entries) => entries, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + for e in entries { match e.and_then(|e| e.metadata()) { Ok(m) => total_size += m.len(), Err(e) if e.kind() == ErrorKind::NotFound => continue, @@ -2149,7 +2162,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - lb.append(region_id, vec![apply_entry])?; + lb.append(region_id, None, vec![apply_entry])?; // Put region info into kv engine. 
let region = gen_test_region(region_id, 1, 1); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2d27b56fda5..0344adb2b92 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -24,14 +24,16 @@ use kvproto::{ }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, Message}; +use protobuf::{self, CodedInputStream, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType, Snapshot}, + eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; use tikv_util::{ - box_err, debug, info, + box_err, + codec::number::{decode_u64, NumberEncoder}, + debug, info, store::{find_peer_by_id, region}, time::monotonic_raw_now, Either, @@ -228,7 +230,8 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { AdminCmdEpochState::new(true, true, false, false) } - AdminCmdType::BatchSwitchWitness => unimplemented!(), + AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), + AdminCmdType::UpdateGcPeer => AdminCmdEpochState::new(false, false, false, false), } } @@ -335,6 +338,7 @@ pub fn compare_region_epoch( // flashback. pub fn check_flashback_state( is_in_flashback: bool, + flashback_start_ts: u64, req: &RaftCmdRequest, region_id: u64, skip_not_prepared: bool, @@ -346,11 +350,20 @@ pub fn check_flashback_state( { return Ok(()); } + // TODO: only use `flashback_start_ts` to check flashback state. + let is_in_flashback = is_in_flashback || flashback_start_ts > 0; let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::FLASHBACK); - // If the region is in the flashback state, the only allowed request is the - // flashback request itself. + // If the region is in the flashback state: + // - A request with flashback flag will be allowed. + // - A read request whose `read_ts` is smaller than `flashback_start_ts` will + // be allowed. 
if is_in_flashback && !is_flashback_request { + if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + if read_ts != 0 && read_ts < flashback_start_ts { + return Ok(()); + } + } return Err(Error::FlashbackInProgress(region_id)); } // If the region is not in the flashback state, the flashback request itself @@ -361,6 +374,12 @@ pub fn check_flashback_state( Ok(()) } +pub fn encode_start_ts_into_flag_data(header: &mut RaftRequestHeader, start_ts: u64) { + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(start_ts).unwrap(); + header.set_flag_data(data.into()); +} + pub fn is_region_epoch_equal( from_epoch: &metapb::RegionEpoch, current_epoch: &metapb::RegionEpoch, @@ -725,6 +744,24 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } +pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { + if entry.get_entry_type() != EntryType::EntryNormal { + return RaftRequestHeader::default(); + } + // request header is encoded into data + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return RaftRequestHeader::default(); + } + let (field_number, _) = is.read_tag_unpack().unwrap(); + let t = is.read_message().unwrap(); + // Header field is of number 1 + if field_number != 1 { + panic!("unexpected field number: {} {:?}", field_number, t); + } + t +} + /// Parse data of entry `index`. /// /// # Panics @@ -1671,6 +1708,7 @@ mod tests { metapb::{self, RegionEpoch}, raft_cmdpb::AdminRequest, }; + use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; use tikv_util::store::new_peer; use time::Duration as TimeDuration; @@ -1749,6 +1787,20 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } + #[test] + fn test_get_entry_header() { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("test".to_owned()); + req.set_header(header); + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_data(req.write_to_bytes().unwrap().into()); + let header = get_entry_header(&entry); + assert_eq!(header.get_resource_group_name(), "test"); + } + #[test] fn test_timespec_u64() { let cases = vec![ diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 5861e27a508..e6c3c505cdf 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -59,6 +59,7 @@ make_static_metric! 
{ witness, flashback_not_prepared, flashback_in_progress, + wait_data, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e021651ba3d..ac23f4e58d5 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -27,7 +27,8 @@ pub use self::{ consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, - Runner as PdRunner, Task as PdTask, + Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ @@ -44,5 +45,5 @@ pub use self::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, split_config::{SplitConfig, SplitConfigManager}, - split_controller::{AutoSplitController, ReadStats, SplitConfigChange, WriteStats}, + split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index fdfa1b44c85..18ecc77f599 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -25,8 +25,8 @@ use kvproto::{ kvrpcpb::DiskFullOpt, metapb, pdpb, raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, + AdminCmdType, AdminRequest, BatchSwitchWitnessRequest, ChangePeerRequest, + ChangePeerV2Request, RaftCmdRequest, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, @@ -69,6 +69,8 @@ use crate::{ }, }; +pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; + type RecordPairVec = Vec; #[derive(Default, Debug, Clone)] @@ -189,7 +191,6 @@ where id: u64, duration: RaftstoreDuration, }, - UpdateRegionCpuCollector(bool), RegionCpuRecords(Arc), ReportMinResolvedTs { store_id: u64, @@ -267,7 +268,7 @@ pub struct PeerStat { } #[derive(Default)] -pub struct ReportBucket { +struct ReportBucket { current_stat: BucketStat, last_report_stat: Option, last_report_ts: UnixSecs, @@ -418,12 +419,6 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } - Task::UpdateRegionCpuCollector(is_register) => { - if is_register { - return write!(f, "register region cpu collector"); - } - write!(f, "deregister region cpu collector") - } Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } @@ -476,12 +471,83 @@ fn convert_record_pairs(m: HashMap) -> RecordPairVec { .collect() } -struct StatsMonitor +#[derive(Clone)] +pub struct WrappedScheduler(Scheduler>); + +impl Collector for WrappedScheduler where EK: KvEngine, ER: RaftEngine, { - scheduler: Scheduler>, + fn collect(&self, records: Arc) { + self.0.schedule(Task::RegionCpuRecords(records)).ok(); + } +} + +pub trait StoreStatsReporter: Send + Clone + Sync + 'static + Collector { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ); + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); + fn auto_split(&self, split_infos: Vec); +} + +impl StoreStatsReporter for WrappedScheduler +where + EK: 
KvEngine, + ER: RaftEngine, +{ + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::StoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send store infos to pd worker"; + "err" => ?e, + ); + } + } + + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } + } + + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } + } +} + +pub struct StatsMonitor +where + T: StoreStatsReporter, +{ + reporter: T, handle: Option>, timer: Option>, read_stats_sender: Option>, @@ -492,18 +558,13 @@ where report_min_resolved_ts_interval: Duration, } -impl StatsMonitor +impl StatsMonitor where - EK: KvEngine, - ER: RaftEngine, + T: StoreStatsReporter, { - pub fn new( - interval: Duration, - report_min_resolved_ts_interval: Duration, - scheduler: Scheduler>, - ) -> Self { + pub fn new(interval: Duration, report_min_resolved_ts_interval: Duration, reporter: T) -> Self { StatsMonitor { - scheduler, + reporter, handle: None, timer: None, read_stats_sender: None, @@ -524,11 +585,10 @@ where &mut self, mut auto_split_controller: AutoSplitController, region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, store_id: u64, ) -> Result<(), io::Error> { - if self.collect_tick_interval < default_collect_tick_interval() - || self.collect_store_infos_interval < self.collect_tick_interval - { + if self.collect_tick_interval < default_collect_tick_interval() { info!( "interval is too small, skip stats monitoring. If we are running tests, it is normal, otherwise a check is needed." ); @@ -555,7 +615,7 @@ where let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); self.cpu_stats_sender = Some(cpu_stats_sender); - let scheduler = self.scheduler.clone(); + let reporter = self.reporter.clone(); let props = tikv_util::thread_group::current_properties(); fn is_enable_tick(timer_cnt: u64, interval: u64) -> bool { @@ -570,13 +630,23 @@ where // make sure the record won't be disturbed. let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); + let mut region_cpu_records_collector = None; + // Register the region CPU records collector. 
+ if auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio + > 0.0 + { + region_cpu_records_collector = + Some(collector_reg_handle.register(Box::new(reporter.clone()), false)); + } while let Err(mpsc::RecvTimeoutError::Timeout) = timer_rx.recv_timeout(tick_interval) { if is_enable_tick(timer_cnt, collect_store_infos_interval) { StatsMonitor::collect_store_infos( &mut collect_store_infos_thread_stats, - &scheduler, + &reporter, ); } if is_enable_tick(timer_cnt, load_base_split_check_interval) { @@ -585,14 +655,15 @@ where &read_stats_receiver, &cpu_stats_receiver, &mut load_base_split_thread_stats, - &scheduler, + &reporter, + &collector_reg_handle, + &mut region_cpu_records_collector, ); } if is_enable_tick(timer_cnt, report_min_resolved_ts_interval) { - StatsMonitor::report_min_resolved_ts( - ®ion_read_progress, + reporter.report_min_resolved_ts( store_id, - &scheduler, + region_read_progress.get_min_resolved_ts(), ); } timer_cnt += 1; @@ -604,26 +675,13 @@ where Ok(()) } - pub fn collect_store_infos( - thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, - ) { + pub fn collect_store_infos(thread_stats: &mut ThreadInfoStatistics, reporter: &T) { thread_stats.record(); let cpu_usages = convert_record_pairs(thread_stats.get_cpu_usages()); let read_io_rates = convert_record_pairs(thread_stats.get_read_io_rates()); let write_io_rates = convert_record_pairs(thread_stats.get_write_io_rates()); - let task = Task::StoreInfos { - cpu_usages, - read_io_rates, - write_io_rates, - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send store infos to pd worker"; - "err" => ?e, - ); - } + reporter.report_store_infos(cpu_usages, read_io_rates, write_io_rates); } pub fn load_base_split( @@ -631,16 +689,19 @@ where read_stats_receiver: &Receiver, cpu_stats_receiver: &Receiver>, thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, + reporter: &T, + collector_reg_handle: &CollectorRegHandle, + region_cpu_records_collector: &mut Option, ) { let start_time = TiInstant::now(); match auto_split_controller.refresh_and_check_cfg() { SplitConfigChange::UpdateRegionCpuCollector(is_register) => { - if let Err(e) = scheduler.schedule(Task::UpdateRegionCpuCollector(is_register)) { - error!( - "failed to register or deregister the region cpu collector"; - "is_register" => is_register, - "err" => ?e, + // If it's a deregister task, just take and drop the original collector. 
+ if !is_register { + region_cpu_records_collector.take(); + } else { + region_cpu_records_collector.get_or_insert( + collector_reg_handle.register(Box::new(reporter.clone()), false), ); } } @@ -658,13 +719,7 @@ where let (top_qps, split_infos) = auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); auto_split_controller.clear(); - let task = Task::AutoSplit { split_infos }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send split infos to pd worker"; - "err" => ?e, - ); - } + reporter.auto_split(split_infos); for i in 0..TOP_N { if i < top_qps.len() { READ_QPS_TOPN @@ -677,23 +732,6 @@ where LOAD_BASE_SPLIT_DURATION_HISTOGRAM.observe(start_time.saturating_elapsed_secs()); } - pub fn report_min_resolved_ts( - region_read_progress: &RegionReadProgressRegistry, - store_id: u64, - scheduler: &Scheduler>, - ) { - let task = Task::ReportMinResolvedTs { - store_id, - min_resolved_ts: region_read_progress.get_min_resolved_ts(), - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send min resolved ts to pd worker"; - "err" => ?e, - ); - } - } - pub fn stop(&mut self) { if let Some(h) = self.handle.take() { drop(self.timer.take()); @@ -705,14 +743,22 @@ where } } - #[inline(always)] - fn get_read_stats_sender(&self) -> &Option> { - &self.read_stats_sender + #[inline] + pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { + if let Some(sender) = &self.read_stats_sender { + if sender.send(read_stats).is_err() { + warn!("send read_stats failed, are we shutting down?") + } + } } - #[inline(always)] - fn get_cpu_stats_sender(&self) -> &Option>> { - &self.cpu_stats_sender + #[inline] + pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { + if let Some(sender) = &self.cpu_stats_sender { + if sender.send(cpu_stats.clone()).is_err() { + warn!("send region cpu info failed, are we shutting down?") + } + } } } @@ -845,37 +891,6 @@ impl SlowScore { } } -// RegionCpuMeteringCollector is used to collect the region-related CPU info. -struct RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - scheduler: Scheduler>, -} - -impl RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn new(scheduler: Scheduler>) -> RegionCpuMeteringCollector { - RegionCpuMeteringCollector { scheduler } - } -} - -impl Collector for RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn collect(&self, records: Arc) { - self.scheduler - .schedule(Task::RegionCpuRecords(records)) - .ok(); - } -} - pub struct Runner where EK: KvEngine, @@ -896,11 +911,9 @@ where // actually it is the sender connected to Runner's Worker which // calls Runner's run() on Task received. scheduler: Scheduler>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, store_heartbeat_interval: Duration, - collector_reg_handle: CollectorRegHandle, - region_cpu_records_collector: Option, // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, @@ -922,15 +935,12 @@ where ER: RaftEngine, T: PdClient + 'static, { - const INTERVAL_DIVISOR: u32 = 2; - pub fn new( cfg: &Config, store_id: u64, pd_client: Arc, router: RaftRouter, scheduler: Scheduler>, - store_heartbeat_interval: Duration, auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, @@ -941,25 +951,19 @@ where coprocessor_host: CoprocessorHost, causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Runner { - // Register the region CPU records collector. 
- let mut region_cpu_records_collector = None; - if auto_split_controller - .cfg - .region_cpu_overload_threshold_ratio - > 0.0 - { - region_cpu_records_collector = Some(collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(scheduler.clone())), - false, - )); - } - let interval = store_heartbeat_interval / Self::INTERVAL_DIVISOR; + let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; + let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, cfg.report_min_resolved_ts_interval.0, - scheduler.clone(), + WrappedScheduler(scheduler.clone()), ); - if let Err(e) = stats_monitor.start(auto_split_controller, region_read_progress, store_id) { + if let Err(e) = stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + ) { error!("failed to start stats collector, error = {:?}", e); } @@ -975,8 +979,6 @@ where scheduler, store_heartbeat_interval, stats_monitor, - collector_reg_handle, - region_cpu_records_collector, region_cpu_records: HashMap::default(), concurrency_manager, snap_mgr, @@ -1041,21 +1043,6 @@ where self.remote.spawn(f); } - fn handle_update_region_cpu_collector(&mut self, is_register: bool) { - // If it's a deregister task, just take and drop the original collector. - if !is_register { - self.region_cpu_records_collector.take(); - return; - } - if self.region_cpu_records_collector.is_some() { - return; - } - self.region_cpu_records_collector = Some(self.collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(self.scheduler.clone())), - false, - )); - } - // Note: The parameter doesn't contain `self` because this function may // be called in an asynchronous context. fn handle_ask_batch_split( @@ -1564,6 +1551,18 @@ where deadline:None, disk_full_opt:DiskFullOpt::AllowedOnAlmostFull, }); + } else if resp.has_switch_witnesses() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["switch witness"]) + .inc(); + + let mut switches = resp.take_switch_witnesses(); + info!("try to switch witness"; + "region_id" => region_id, + "switch witness" => ?switches + ); + let req = new_batch_switch_witness(switches.take_switch_witnesses().into()); + send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); } else { PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); } @@ -1604,11 +1603,7 @@ where self.merge_buckets(region_buckets); } if !read_stats.region_infos.is_empty() { - if let Some(sender) = self.stats_monitor.get_read_stats_sender() { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_read_stats(read_stats); } } @@ -1756,11 +1751,7 @@ where // TODO: more accurate CPU consumption of a specified region. fn handle_region_cpu_records(&mut self, records: Arc) { // Send Region CPU info to AutoSplitController inside the stats_monitor. - if let Some(cpu_stats_sender) = self.stats_monitor.get_cpu_stats_sender() { - if cpu_stats_sender.send(records.clone()).is_err() { - warn!("send region cpu info failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_cpu_stats(&records); calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); } @@ -1856,22 +1847,10 @@ where stats.set_is_busy(true); // We do not need to report store_info, so we just set `None` here. 
- let task = Task::StoreHeartbeat { - stats, - store_info: None, - report: None, - dr_autosync_status: None, - }; - if let Err(e) = self.scheduler.schedule(task) { - error!("force report store heartbeat failed"; - "store_id" => self.store_id, - "err" => ?e - ); - } else { - warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; - "store_id" => self.store_id, - ); - } + self.handle_store_heartbeat(stats, None, None, None); + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); } fn is_store_heartbeat_delayed(&self) -> bool { @@ -1954,48 +1933,43 @@ where let f = async move { for split_info in split_infos { - if let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await - { - // Try to split the region with the given split key. - if let Some(split_key) = split_info.split_key { - Self::handle_ask_batch_split( - router.clone(), - scheduler.clone(), - pd_client.clone(), - region, - vec![split_key], - split_info.peer, - true, - Callback::None, - String::from("auto_split"), - remote.clone(), + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::handle_ask_batch_split( + router.clone(), + scheduler.clone(), + pd_client.clone(), + region, + vec![split_key], + split_info.peer, + true, + Callback::None, + String::from("auto_split"), + remote.clone(), + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. + } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + let start_key = split_info.start_key.unwrap(); + let end_key = split_info.end_key.unwrap(); + let region_id = region.get_id(); + let msg = CasualMessage::HalfSplitRegion { + region_epoch: region.get_region_epoch().clone(), + start_key: Some(start_key.clone()), + end_key: Some(end_key.clone()), + policy: pdpb::CheckPolicy::Scan, + source: "auto_split", + cb: Callback::None, + }; + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + error!("send auto half split request failed"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => ?e, ); - return; - } - // Try to split the region on half within the given key range - // if there is no `split_key` been given. 
- if split_info.start_key.is_some() && split_info.end_key.is_some() { - let start_key = split_info.start_key.unwrap(); - let end_key = split_info.end_key.unwrap(); - let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { - region_epoch: region.get_region_epoch().clone(), - start_key: Some(start_key.clone()), - end_key: Some(end_key.clone()), - policy: pdpb::CheckPolicy::Scan, - source: "auto_split", - cb: Callback::None, - }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) - { - error!("send auto half split request failed"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(&start_key), - "end_key" => log_wrappers::Value::key(&end_key), - "err" => ?e, - ); - } } } } @@ -2124,9 +2098,6 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), - Task::UpdateRegionCpuCollector(is_register) => { - self.handle_update_region_cpu_collector(is_register) - } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { store_id, @@ -2298,6 +2269,24 @@ fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { req } +fn new_batch_switch_witness(switches: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSwitchWitness); + let switch_reqs = switches + .into_iter() + .map(|s| { + let mut sw = SwitchWitnessRequest::default(); + sw.set_peer_id(s.get_peer_id()); + sw.set_is_witness(s.get_is_witness()); + sw + }) + .collect(); + let mut sw = BatchSwitchWitnessRequest::default(); + sw.set_switch_witnesses(switch_reqs); + req.set_switch_witnesses(sw); + req +} + fn send_admin_request( router: &RaftRouter, region_id: u64, @@ -2469,7 +2458,7 @@ mod tests { struct RunnerTest { store_stat: Arc>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, } impl RunnerTest { @@ -2481,13 +2470,16 @@ mod tests { let mut stats_monitor = StatsMonitor::new( Duration::from_secs(interval), Duration::from_secs(0), - scheduler, + WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - if let Err(e) = - stats_monitor.start(AutoSplitController::default(), region_read_progress, 1) - { + if let Err(e) = stats_monitor.start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 1, + ) { error!("failed to start stats collector, error = {:?}", e); } diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index ce829ed61b2..3edabae71a0 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -214,7 +214,7 @@ mod tests { for i in 0..100 { let mut e = Entry::new(); e.set_index(i); - raft_wb.append(region_id, vec![e]).unwrap(); + raft_wb.append(region_id, None, vec![e]).unwrap(); } raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a8fc2e6e3df..5d6835666b4 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -286,7 +286,7 @@ impl Drop for ReadDelegate { /// #[RaftstoreCommon] pub trait ReadExecutorProvider: Send + 
Clone + 'static { - type Executor: ReadExecutor; + type Executor; type StoreMeta; fn store_id(&self) -> Option; @@ -412,6 +412,8 @@ pub struct ReadDelegate { pub txn_ext: Arc, pub read_progress: Arc, pub pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + pub wait_data: bool, // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` // up-to-date with the global `ReadDelegate` stored at `StoreMeta` @@ -435,6 +437,7 @@ impl ReadDelegate { txn_ext: peer.txn_ext.clone(), read_progress: peer.read_progress.clone(), pending_remove: false, + wait_data: false, bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), track_ver: TrackVer::new(), } @@ -463,6 +466,7 @@ impl ReadDelegate { txn_ext, read_progress, pending_remove: false, + wait_data: false, bucket_meta, track_ver: TrackVer::new(), } @@ -496,6 +500,9 @@ impl ReadDelegate { Progress::RegionBuckets(bucket_meta) => { self.bucket_meta = Some(bucket_meta); } + Progress::WaitData(wait_data) => { + self.wait_data = wait_data; + } } } @@ -591,6 +598,7 @@ impl ReadDelegate { txn_ext: Default::default(), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, } @@ -620,6 +628,7 @@ pub enum Progress { AppliedTerm(u64), LeaderLease(RemoteLease), RegionBuckets(Arc), + WaitData(bool), } impl Progress { @@ -642,6 +651,10 @@ impl Progress { pub fn region_buckets(bucket_meta: Arc) -> Progress { Progress::RegionBuckets(bucket_meta) } + + pub fn wait_data(wait_data: bool) -> Progress { + Progress::WaitData(wait_data) + } } struct SnapCache @@ -687,11 +700,7 @@ where /// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the /// relevant regions by LocalReader so that these requests can be handled by the /// relevant ReadDelegate respectively. -pub struct LocalReaderCore -where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, -{ +pub struct LocalReaderCore { pub store_id: Cell>, store_meta: S, pub delegates: LruCache, @@ -699,7 +708,7 @@ where impl LocalReaderCore where - D: ReadExecutor + Deref + Clone, + D: Deref + Clone, S: ReadExecutorProvider, { pub fn new(store_meta: S) -> Self { @@ -801,13 +810,22 @@ where // Check witness if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::RecoveryInProgress(region_id)); + return Err(Error::IsWitness(region_id)); + } + + // Check non-witness hasn't finish applying snapshot yet. + if delegate.wait_data { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.wait_data.inc()); + return Err(Error::IsWitness(region_id)); } // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; - if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id, false) { + let flashback_start_ts = delegate.region.flashback_start_ts; + if let Err(e) = + util::check_flashback_state(is_in_flashback, flashback_start_ts, req, region_id, false) + { TLS_LOCAL_READ_METRICS.with(|m| match e { Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() @@ -827,8 +845,7 @@ where impl Clone for LocalReaderCore where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, + S: Clone, { fn clone(&self) -> Self { LocalReaderCore { @@ -1304,6 +1321,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; @@ -1595,6 +1613,7 @@ mod tests { track_ver: TrackVer::new(), read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, + wait_data: false, bucket_meta: None, }; meta.readers.insert(1, read_delegate); @@ -1720,6 +1739,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 6555e96f102..d09a6dd9f53 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -43,7 +43,7 @@ where for _ in 0..size { if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty) { error!( - "failed to decrese thread pool"; + "failed to decrease thread pool"; "decrease to" => size, "err" => %e, ); diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index a78e903bc72..fd58fac1601 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -149,6 +149,7 @@ pub struct LeadershipResolver { region_map: HashMap>, // region_id -> peers id, record the responses. resp_map: HashMap>, + checking_regions: HashSet, valid_regions: HashSet, gc_interval: Duration, @@ -176,6 +177,7 @@ impl LeadershipResolver { region_map: HashMap::default(), resp_map: HashMap::default(), valid_regions: HashSet::default(), + checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), gc_interval, } @@ -188,6 +190,7 @@ impl LeadershipResolver { self.region_map = HashMap::default(); self.resp_map = HashMap::default(); self.valid_regions = HashSet::default(); + self.checking_regions = HashSet::default(); self.last_gc_time = now; } } @@ -203,6 +206,7 @@ impl LeadershipResolver { for v in self.resp_map.values_mut() { v.clear(); } + self.checking_regions.clear(); self.valid_regions.clear(); } @@ -248,7 +252,11 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its // leadership. - pub async fn resolve(&mut self, _regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + if regions.is_empty() { + return regions; + } + // Clear previous result before resolving. self.clear(); // GC when necessary to prevent memory leak. 
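The new `wait_data` flag above feeds the same rejection path as the witness check: a non-witness peer that has not finished applying its snapshot refuses local reads exactly as a witness does, and only a later `Progress::WaitData(false)` update re-enables them. A minimal, self-contained sketch of that gating order (simplified types and a hypothetical `pre_read_check` helper, not the actual `LocalReaderCore` signatures or its flashback handling):

```rust
// Condensed model of the local-read gating added in read.rs.
#[derive(Debug, PartialEq)]
enum ReadReject {
    IsWitness,
    FlashbackNotPrepared,
}

struct DelegateState {
    is_witness: bool,
    // true while a former witness is still catching up on snapshot data
    wait_data: bool,
    is_in_flashback: bool,
}

fn pre_read_check(d: &DelegateState, req_is_flashback: bool) -> Result<(), ReadReject> {
    // A witness peer holds no data, so local reads are rejected outright.
    if d.is_witness {
        return Err(ReadReject::IsWitness);
    }
    // A peer still waiting for snapshot data is rejected the same way.
    if d.wait_data {
        return Err(ReadReject::IsWitness);
    }
    // Flashback state must match the request (simplified stand-in for
    // util::check_flashback_state).
    if d.is_in_flashback != req_is_flashback {
        return Err(ReadReject::FlashbackNotPrepared);
    }
    Ok(())
}

fn main() {
    let d = DelegateState { is_witness: false, wait_data: true, is_in_flashback: false };
    assert_eq!(pre_read_check(&d, false), Err(ReadReject::IsWitness));
}
```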
@@ -256,15 +264,22 @@ impl LeadershipResolver { PENDING_RTS_COUNT.inc(); defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| _regions.clone()); + fail_point!("before_sync_replica_read_state", |_| regions.clone()); let store_id = self.store_id; let valid_regions = &mut self.valid_regions; let region_map = &mut self.region_map; let resp_map = &mut self.resp_map; let store_req_map = &mut self.store_req_map; + let checking_regions = &mut self.checking_regions; + for region_id in ®ions { + checking_regions.insert(*region_id); + } self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { + if !checking_regions.contains(region_id) { + continue; + } let core = read_progress.get_core(); let local_leader_info = core.get_local_leader_info(); let leader_id = local_leader_info.get_leader_id(); @@ -512,3 +527,112 @@ async fn get_tikv_client( RTS_TIKV_CLIENT_INIT_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); Ok(cli) } + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }, + time::Duration, + }; + + use grpcio::{self, ChannelBuilder, EnvBuilder, Server, ServerBuilder}; + use kvproto::{metapb::Region, tikvpb::Tikv, tikvpb_grpc::create_tikv}; + use pd_client::PdClient; + use raftstore::store::util::RegionReadProgress; + use tikv_util::store::new_peer; + + use super::*; + + #[derive(Clone)] + struct MockTikv { + req_tx: Sender, + } + + impl Tikv for MockTikv { + fn check_leader( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: CheckLeaderRequest, + sink: ::grpcio::UnarySink, + ) { + self.req_tx.send(req).unwrap(); + ctx.spawn(async { + sink.success(CheckLeaderResponse::default()).await.unwrap(); + }) + } + } + + struct MockPdClient {} + impl PdClient for MockPdClient {} + + fn new_rpc_suite(env: Arc) -> (Server, TikvClient, Receiver) { + let (tx, rx) = channel(); + let tikv_service = MockTikv { req_tx: tx }; + let builder = ServerBuilder::new(env.clone()).register_service(create_tikv(tikv_service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(env).connect(&addr); + let client = TikvClient::new(channel); + (server, client, rx) + } + + #[tokio::test] + async fn test_resolve_leader_request_size() { + let env = Arc::new(EnvBuilder::new().build()); + let (mut server, tikv_client, rx) = new_rpc_suite(env.clone()); + + let mut region1 = Region::default(); + region1.id = 1; + region1.peers.push(new_peer(1, 1)); + region1.peers.push(new_peer(2, 11)); + let progress1 = RegionReadProgress::new(®ion1, 1, 1, 1); + progress1.update_leader_info(1, 1, ®ion1); + + let mut region2 = Region::default(); + region2.id = 2; + region2.peers.push(new_peer(1, 2)); + region2.peers.push(new_peer(2, 22)); + let progress2 = RegionReadProgress::new(®ion2, 1, 1, 2); + progress2.update_leader_info(2, 2, ®ion2); + + let mut leader_resolver = LeadershipResolver::new( + 1, // store id + Arc::new(MockPdClient {}), + env.clone(), + Arc::new(SecurityManager::default()), + RegionReadProgressRegistry::new(), + Duration::from_secs(1), + ); + leader_resolver + .tikv_clients + .lock() + .await + .insert(2 /* store id */, tikv_client); + leader_resolver + .region_read_progress + .insert(1, Arc::new(progress1)); + leader_resolver + .region_read_progress + .insert(2, Arc::new(progress2)); + + leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + 
let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 2); + + // Checking one region only send 1 region in request. + leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 1); + + // Checking zero region does not send request. + leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + + let _ = server.shutdown().await; + } +} diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml new file mode 100644 index 00000000000..39d37ac0f6b --- /dev/null +++ b/components/resource_control/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "resource_control" +version = "0.0.1" +edition = "2021" +publish = false + +[features] +failpoints = ["fail/failpoints"] + +[dependencies] +byteorder = "1.2" +crossbeam-skiplist = "0.1" +dashmap = "5.1" +fail = "0.5" +futures = { version = "0.3" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1.0" +online_config = { workspace = true } +pd_client = { workspace = true } +pin-project = "1.0" +prometheus = { version = "0.13", features = ["nightly"] } +protobuf = { version = "2.8", features = ["bytes"] } +serde = { version = "1.0", features = ["derive"] } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +test_pd = { workspace = true } +test_pd_client = { workspace = true } +tikv_util = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs new file mode 100644 index 00000000000..8027a27b394 --- /dev/null +++ b/components/resource_control/src/future.rs @@ -0,0 +1,46 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project::pin_project; +use tikv_util::time::Instant; + +use crate::resource_group::{ResourceConsumeType, ResourceController}; + +#[pin_project] +pub struct ControlledFuture { + #[pin] + future: F, + controller: Arc, + group_name: Vec, +} + +impl ControlledFuture { + pub fn new(future: F, controller: Arc, group_name: Vec) -> Self { + Self { + future, + controller, + group_name, + } + } +} + +impl Future for ControlledFuture { + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let now = Instant::now(); + let res = this.future.poll(cx); + this.controller.consume( + this.group_name, + ResourceConsumeType::CpuTime(now.saturating_elapsed()), + ); + res + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs new file mode 100644 index 00000000000..5534ed2153d --- /dev/null +++ b/components/resource_control/src/lib.rs @@ -0,0 +1,23 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use online_config::OnlineConfig; +use serde::{Deserialize, Serialize}; + +mod resource_group; +pub use resource_group::{ + ResourceConsumeType, ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL, +}; + +mod future; +pub use future::ControlledFuture; + +mod service; +pub use service::ResourceManagerService; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct Config { + #[online_config(skip)] + pub enabled: bool, +} diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs new file mode 100644 index 00000000000..c5112c13516 --- /dev/null +++ b/components/resource_control/src/resource_group.rs @@ -0,0 +1,490 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use dashmap::{mapref::one::Ref, DashMap}; +use kvproto::{ + kvrpcpb::CommandPri, + resource_manager::{GroupMode, ResourceGroup}, +}; +use yatp::queue::priority::TaskPriorityProvider; + +// a read task cost at least 50us. +const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50; +// extra task schedule factor +const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; +/// duration to update the minimal priority value of each resource group. +pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); +/// default resource group name +const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; +/// default value of max RU quota. +const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; + +pub enum ResourceConsumeType { + CpuTime(Duration), + IoBytes(u64), +} + +/// ResourceGroupManager manages the metadata of each resource group. +#[derive(Default)] +pub struct ResourceGroupManager { + resource_groups: DashMap, + registry: Mutex>>, +} + +impl ResourceGroupManager { + fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { + match (rg.get_mode(), is_read) { + // RU mode, read and write use the same setting. + (GroupMode::RuMode, _) => rg + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + // TODO: currently we only consider the cpu usage in the read path, we may also take + // io read bytes into account later. + (GroupMode::RawMode, true) => rg + .get_raw_resource_settings() + .get_cpu() + .get_settings() + .get_fill_rate(), + (GroupMode::RawMode, false) => rg + .get_raw_resource_settings() + .get_io_write() + .get_settings() + .get_fill_rate(), + // return a default value for unsupported config. 
+ (GroupMode::Unknown, _) => 1, + } + } + + pub fn add_resource_group(&self, rg: ResourceGroup) { + let group_name = rg.get_name().to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + let ru_quota = Self::get_ru_setting(&rg, controller.is_read); + controller.add_resource_group(group_name.clone().into_bytes(), ru_quota); + }); + self.resource_groups.insert(group_name, rg); + } + + pub fn remove_resource_group(&self, name: &str) { + let group_name = name.to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + controller.remove_resource_group(group_name.as_bytes()); + }); + self.resource_groups.remove(&group_name); + } + + pub fn get_resource_group(&self, name: &str) -> Option> { + self.resource_groups.get(&name.to_ascii_lowercase()) + } + + pub fn get_all_resource_groups(&self) -> Vec { + self.resource_groups.iter().map(|g| g.clone()).collect() + } + + pub fn derive_controller(&self, name: String, is_read: bool) -> Arc { + let controller = Arc::new(ResourceController::new(name, is_read)); + self.registry.lock().unwrap().push(controller.clone()); + for g in &self.resource_groups { + let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); + controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); + } + controller + } + + pub fn advance_min_virtual_time(&self) { + for controller in self.registry.lock().unwrap().iter() { + controller.update_min_virtual_time(); + } + } +} + +pub struct ResourceController { + // resource controller name is not used currently. + #[allow(dead_code)] + name: String, + // We handle the priority differently between read and write request: + // 1. the priority factor is calculate based on read/write RU settings. + // 2. for read request, we increase a constant virtual time delta at each `get_priority` call + // because the cost can't be calculated at start, so we only increase a constant delta and + // increase the real cost after task is executed; but don't increase it at write because + // the cost is known so we just pre-consume it. + is_read: bool, + // Track the maximum ru quota used to calculate the factor of each resource group. + // factor = max_ru_quota / group_ru_quota * 10.0 + // We use mutex here to ensure when we need to change this value and do adjust all resource + // groups' factors, it can't be changed concurrently. + max_ru_quota: Mutex, + // record consumption of each resource group, name --> resource_group + resource_consumptions: DashMap, GroupPriorityTracker>, + + last_min_vt: AtomicU64, +} + +impl ResourceController { + pub fn new(name: String, is_read: bool) -> Self { + let controller = Self { + name, + is_read, + max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), + resource_consumptions: DashMap::new(), + last_min_vt: AtomicU64::new(0), + }; + // add the "default" resource group + controller.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + controller + } + + fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + if quota > 0 { + // we use max_quota / quota as the resource group factor, but because we need to + // cast the value to integer, so we times it by 10 to ensure the accuracy is + // enough. + (max_quota as f64 / quota as f64 * 10.0).round() as u64 + } else { + 1 + } + } + + fn add_resource_group(&self, name: Vec, ru_quota: u64) { + let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); + if ru_quota > *max_ru_quota { + *max_ru_quota = ru_quota; + // adjust all group weight because the current value is too small. 
+ self.adjust_all_resource_group_factors(ru_quota); + } + let weight = Self::calculate_factor(*max_ru_quota, ru_quota); + + let vt_delta_for_get = if self.is_read { + DEFAULT_PRIORITY_PER_READ_TASK * weight + } else { + 0 + }; + let group = GroupPriorityTracker { + ru_quota, + weight, + virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), + vt_delta_for_get, + }; + // maybe update existed group + self.resource_consumptions.insert(name, group); + } + + // we calculate the weight of each resource group based on the currently maximum + // ru quota, if a incoming resource group has a bigger quota, we need to + // adjust all the existing groups. As we expect this won't happen very + // often, and iterate 10k entry cost less than 5ms, so the performance is + // acceptable. + fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { + self.resource_consumptions.iter_mut().for_each(|mut g| { + g.value_mut().weight = Self::calculate_factor(max_ru_quota, g.ru_quota); + }); + } + + fn remove_resource_group(&self, name: &[u8]) { + // do not remove the default resource group, reset to default setting instead. + if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { + self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + } + self.resource_consumptions.remove(name); + } + + #[inline] + fn resource_group(&self, name: &[u8]) -> Ref<'_, Vec, GroupPriorityTracker> { + if let Some(g) = self.resource_consumptions.get(name) { + g + } else { + self.resource_consumptions + .get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()) + .unwrap() + } + } + + pub fn consume(&self, name: &[u8], delta: ResourceConsumeType) { + self.resource_group(name).consume(delta) + } + + pub fn update_min_virtual_time(&self) { + let mut min_vt = u64::MAX; + let mut max_vt = 0; + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if min_vt > vt { + min_vt = vt; + } + if max_vt < vt { + max_vt = vt; + } + }); + + // TODO: use different threshold for different resource type + // needn't do update if the virtual different is less than 100ms/100KB. + if min_vt + 100_000 >= max_vt { + return; + } + + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if vt < max_vt { + // TODO: is increase by half is a good choice. + g.increase_vt((max_vt - vt) / 2); + } + }); + // max_vt is actually a little bigger than the current min vt, but we don't + // need totally accurate here. + self.last_min_vt.store(max_vt, Ordering::Relaxed); + } + + pub fn get_priority(&self, name: &[u8], pri: CommandPri) -> u64 { + let level = match pri { + CommandPri::Low => 2, + CommandPri::Normal => 1, + CommandPri::High => 0, + }; + self.resource_group(name).get_priority(level) + } +} + +impl TaskPriorityProvider for ResourceController { + fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { + self.resource_group(extras.metadata()) + .get_priority(extras.current_level() as usize) + } +} + +struct GroupPriorityTracker { + // the ru setting of this group. 
+ ru_quota: u64, + weight: u64, + virtual_time: AtomicU64, + // the constant delta value for each `get_priority` call, + vt_delta_for_get: u64, +} + +impl GroupPriorityTracker { + fn get_priority(&self, level: usize) -> u64 { + let task_extra_priority = TASK_EXTRA_FACTOR_BY_LEVEL[level] * 1000 * self.weight; + (if self.vt_delta_for_get > 0 { + self.virtual_time + .fetch_add(self.vt_delta_for_get, Ordering::Relaxed) + + self.vt_delta_for_get + } else { + self.virtual_time.load(Ordering::Relaxed) + }) + task_extra_priority + } + + #[inline] + fn current_vt(&self) -> u64 { + self.virtual_time.load(Ordering::Relaxed) + } + + #[inline] + fn increase_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_add(vt_delta, Ordering::Relaxed); + } + + // TODO: make it delta type as generic to avoid mixed consume different types. + #[inline] + fn consume(&self, delta: ResourceConsumeType) { + let vt_delta = match delta { + ResourceConsumeType::CpuTime(dur) => dur.as_micros() as u64, + ResourceConsumeType::IoBytes(bytes) => bytes, + } * self.weight; + self.increase_vt(vt_delta); + } +} + +#[cfg(test)] +pub(crate) mod tests { + use yatp::queue::Extras; + + use super::*; + + pub fn new_resource_group_ru(name: String, ru: u64) -> ResourceGroup { + new_resource_group(name, true, ru, ru) + } + + pub fn new_resource_group( + name: String, + is_ru_mode: bool, + read_tokens: u64, + write_tokens: u64, + ) -> ResourceGroup { + use kvproto::resource_manager::{GroupRawResourceSettings, GroupRequestUnitSettings}; + + let mut group = ResourceGroup::new(); + group.set_name(name); + let mode = if is_ru_mode { + GroupMode::RuMode + } else { + GroupMode::RawMode + }; + group.set_mode(mode); + if is_ru_mode { + assert!(read_tokens == write_tokens); + let mut ru_setting = GroupRequestUnitSettings::new(); + ru_setting + .mut_r_u() + .mut_settings() + .set_fill_rate(read_tokens); + group.set_r_u_settings(ru_setting); + } else { + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + } + group + } + + #[test] + fn test_resource_group() { + let resource_manager = ResourceGroupManager::default(); + + let group1 = new_resource_group_ru("TEST".into(), 100); + resource_manager.add_resource_group(group1); + + assert!(resource_manager.get_resource_group("test1").is_none()); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 100 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group1 = new_resource_group_ru("Test".into(), 200); + resource_manager.add_resource_group(group1); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 200 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group2 = new_resource_group_ru("test2".into(), 400); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.resource_groups.len(), 2); + + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resource_ctl.resource_consumptions.len(), 3); + + let group1 = resource_ctl.resource_group("test".as_bytes()); + assert_eq!(group1.weight, 500); + let group2 = 
resource_ctl.resource_group("test2".as_bytes()); + assert_eq!(group2.weight, 250); + assert_eq!(group1.current_vt(), 0); + + let mut extras1 = Extras::single_level(); + extras1.set_metadata("test".as_bytes().to_owned()); + assert_eq!(resource_ctl.priority_of(&extras1), 25_000); + assert_eq!(group1.current_vt(), 25_000); + + let mut extras2 = Extras::single_level(); + extras2.set_metadata("test2".as_bytes().to_owned()); + assert_eq!(resource_ctl.priority_of(&extras2), 12_500); + assert_eq!(group2.current_vt(), 12_500); + + let mut extras3 = Extras::single_level(); + extras3.set_metadata("unknown_group".as_bytes().to_owned()); + assert_eq!(resource_ctl.priority_of(&extras3), 50); + assert_eq!( + resource_ctl + .resource_group("default".as_bytes()) + .current_vt(), + 50 + ); + + resource_ctl.consume( + "test".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + resource_ctl.consume( + "test2".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + + assert_eq!(group1.current_vt(), 5_025_000); + assert_eq!(group1.current_vt(), group2.current_vt() * 2); + + // test update all group vts + resource_manager.advance_min_virtual_time(); + let group1_vt = group1.current_vt(); + assert_eq!(group1_vt, 5_025_000); + assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); + assert!( + resource_ctl + .resource_group("default".as_bytes()) + .current_vt() + >= group1.current_vt() / 2 + ); + + drop(group1); + drop(group2); + + // test add 1 new resource group + let new_group = new_resource_group_ru("new_group".into(), 500); + resource_manager.add_resource_group(new_group); + + assert_eq!(resource_ctl.resource_consumptions.len(), 4); + let group3 = resource_ctl.resource_group("new_group".as_bytes()); + assert_eq!(group3.weight, 200); + assert!(group3.current_vt() >= group1_vt / 2); + } + + #[test] + fn test_adjust_resource_group_weight() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + let group1 = new_resource_group_ru("test1".into(), 5000); + resource_manager.add_resource_group(group1); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 20 + ); + + // add a resource group with big ru + let group1 = new_resource_group_ru("test2".into(), 50000); + resource_manager.add_resource_group(group1); + assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); + assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); + // resource_ctl_write should be unchanged. + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write.resource_group("test2".as_bytes()).weight, + 10 + ); + } +} diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs new file mode 100644 index 00000000000..fc24af4fdc4 --- /dev/null +++ b/components/resource_control/src/service.rs @@ -0,0 +1,278 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use futures::{compat::Future01CompatExt, StreamExt}; +use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; +use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; +use tikv_util::{error, timer::GLOBAL_TIMER_HANDLE}; + +use crate::ResourceGroupManager; + +#[derive(Clone)] +pub struct ResourceManagerService { + manager: Arc, + pd_client: Arc, + // record watch revision + revision: i64, +} + +impl ResourceManagerService { + /// Constructs a new `Service` with `ResourceGroupManager` and a `RpcClient` + pub fn new( + manager: Arc, + pd_client: Arc, + ) -> ResourceManagerService { + ResourceManagerService { + pd_client, + manager, + revision: 0, + } + } +} + +const RETRY_INTERVAL: Duration = Duration::from_secs(1); // to consistent with pd_client + +impl ResourceManagerService { + pub async fn watch_resource_groups(&mut self) { + // Firstly, load all resource groups as of now. + let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + // Secondly, start watcher at loading revision. + loop { + match self + .pd_client + .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) + { + Ok(mut stream) => { + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + self.revision = r.get_revision(); + r.get_changes() + .iter() + .for_each(|item| match item.get_kind() { + EventType::Put => { + if let Ok(group) = + protobuf::parse_from_bytes::( + item.get_payload(), + ) + { + self.manager.add_resource_group(group); + } + } + EventType::Delete => { + self.manager.remove_resource_group(item.get_name()); + } + }); + } + Err(err) => { + error!("failed to get stream"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } + Err(PdError::DataCompacted(msg)) => { + error!("required revision has been compacted"; "err" => ?msg); + // If the etcd revision is compacted, we need to reload all resouce groups. 
+ let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + } + Err(err) => { + error!("failed to watch resource groups"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } + + async fn list_resource_groups(&mut self) -> (Vec, i64) { + loop { + match self + .pd_client + .load_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string()) + .await + { + Ok((items, revision)) => { + let groups = items + .into_iter() + .filter_map(|g| protobuf::parse_from_bytes(g.get_payload()).ok()) + .collect(); + return (groups, revision); + } + Err(err) => { + error!("failed to load global config"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } + } + } + } +} + +#[cfg(test)] +pub mod tests { + use std::time::Duration; + + use futures::executor::block_on; + use kvproto::pdpb::GlobalConfigItem; + use pd_client::RpcClient; + use protobuf::Message; + use test_pd::{mocker::Service, util::*, Server as MockServer}; + use tikv_util::{config::ReadableDuration, worker::Builder}; + + use crate::resource_group::tests::{new_resource_group, new_resource_group_ru}; + + fn new_test_server_and_client( + update_interval: ReadableDuration, + ) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) + } + + fn add_resource_group(pd_client: Arc, group: ResourceGroup) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Put); + item.set_name(group.get_name().to_string()); + let mut buf = Vec::new(); + group.write_to_vec(&mut buf).unwrap(); + item.set_payload(buf); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + fn delete_resource_group(pd_client: Arc, name: &str) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Delete); + item.set_name(name.to_string()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + use super::*; + #[test] + fn crud_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let group = new_resource_group("TEST".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 1); + + delete_resource_group(s.pd_client.clone(), "TEST"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 2); + + server.stop(); + } + + #[test] + fn watch_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 0); + + let background_worker = 
Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group_ru("TEST1".into(), 100); + add_resource_group(s.pd_client.clone(), group1); + let group2 = new_resource_group_ru("TEST2".into(), 100); + add_resource_group(s.pd_client.clone(), group2); + // Mock modify + let group2 = new_resource_group_ru("TEST2".into(), 50); + add_resource_group(s.pd_client.clone(), group2); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 2); + assert_eq!(revision, 3); + // Mock delete + delete_resource_group(s.pd_client.clone(), "TEST1"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 4); + // Wait for watcher + std::thread::sleep(Duration::from_millis(100)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 1); + assert!(s.manager.get_resource_group("TEST1").is_none()); + let group = s.manager.get_resource_group("TEST2").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_u() + .get_settings() + .get_fill_rate(), + 50 + ); + server.stop(); + } + + #[test] + fn reboot_watch_server_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group_ru("TEST1".into(), 100); + add_resource_group(s.pd_client.clone(), group1); + // Mock reboot watch server + let watch_global_config_fp = "watch_global_config_return"; + fail::cfg(watch_global_config_fp, "return").unwrap(); + std::thread::sleep(Duration::from_millis(100)); + fail::remove(watch_global_config_fp); + // Mock add after rebooting will success + let group1 = new_resource_group_ru("TEST2".into(), 100); + add_resource_group(s.pd_client.clone(), group1); + // Wait watcher update + std::thread::sleep(Duration::from_secs(1)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 2); + + server.stop(); + } +} diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index 52f438236fd..68328c01ebe 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -190,6 +190,10 @@ impl SecurityManager { ) } } + + pub fn get_config(&self) -> &SecurityConfig { + &self.cfg + } } #[derive(Clone)] diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index acdca09b29c..d5e2f177b5e 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -69,6 +69,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index d0637a04b0a..bfaa2a6587e 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -161,7 +161,7 @@ fn run_dump_raftdb_worker( // Assume that we always 
scan entry first and raft state at the // end. batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); } _ => unreachable!("There is only 2 types of keys in raft"), @@ -170,7 +170,7 @@ fn run_dump_raftdb_worker( if local_size >= BATCH_THRESHOLD { local_size = 0; batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); @@ -205,7 +205,7 @@ fn run_dump_raft_engine_worker( begin += old_engine .fetch_entries_to(id, begin, end, Some(BATCH_THRESHOLD), &mut entries) .unwrap() as u64; - batch.append(id, entries).unwrap(); + batch.append(id, None, entries).unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); count_size.fetch_add(size, Ordering::Relaxed); } @@ -303,7 +303,7 @@ mod tests { e.set_index(i); entries.push(e); } - batch.append(num, entries).unwrap(); + batch.append(num, None, entries).unwrap(); } // Get data from raft engine and assert. diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3c926969ce2..2a479964ced 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -82,6 +82,9 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use snap_recovery::RecoveryService; use tikv::{ @@ -244,6 +247,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Option>, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery @@ -321,14 +325,33 @@ where let store_path = Path::new(&config.storage.data_dir).to_owned(); - // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); - let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. + background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + + // Initialize raftstore channels. 
+ let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); + let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), config.coprocessor.clone(), @@ -398,6 +421,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, br_snap_recovery_mode: is_recovering_marked, @@ -733,10 +757,15 @@ where } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + resource_ctl, )) } else { None @@ -810,6 +839,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( @@ -998,13 +1030,7 @@ where ConnectionConfig { keep_alive_interval: self.config.server.grpc_keepalive_time.0, keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self - .security_mgr - .client_suite() - .map_err(|err| { - warn!("Failed to load client TLS suite, ignoring TLS config."; "err" => %err); - }) - .ok(), + tls: Arc::clone(&self.security_mgr), }, ); let backup_stream_endpoint = backup_stream::Endpoint::new( diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 5beddf60151..f193e1c7445 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -47,7 +47,10 @@ use file_system::{ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; -use kvproto::{deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion}; +use kvproto::{ + deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion, + resource_usage_agent::create_resource_metering_pub_sub, +}; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; use raftstore::{ @@ -56,17 +59,21 @@ use raftstore::{ RawConsistencyCheckObserver, }, store::{ - memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, CheckLeaderRunner, SplitConfigManager, - TabletSnapManager, + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, AutoSplitController, CheckLeaderRunner, + SplitConfigManager, TabletSnapManager, }, RegionInfoAccessor, }; +use raftstore_v2::{router::RaftRouter, StateStorage}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - read_pool::{build_yatp_read_pool, ReadPool}, + read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, gc_worker::{AutoGcConfig, GcWorker}, @@ -136,8 +143,7 @@ fn run_impl(config: TikvConfig) { tikv.init_encryption(); let fetcher = tikv.init_io_utility(); let listener = tikv.init_flow_receiver(); - let (raft_engine, engines_info) = tikv.init_raw_engines(listener); - tikv.init_engines(raft_engine); + let engines_info = tikv.init_engines(listener); let server_config = tikv.init_servers::(); tikv.register_services(); 
tikv.init_metrics_flusher(fetcher, engines_info); @@ -201,6 +207,7 @@ struct TikvServer { pd_client: Arc, flow_info_sender: Option>, flow_info_receiver: Option>, + router: Option>, node: Option>, resolver: Option, store_path: PathBuf, @@ -220,6 +227,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Option>, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, } @@ -232,6 +240,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, + rsmeter_pubsub_service: resource_metering::PubSubService, } type LocalServer = Server>; @@ -285,6 +294,25 @@ where config.quota.enable_auto_tune, )); + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. + background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + let mut causal_ts_provider = None; if let ApiVersion::V2 = F::TAG { let tso = block_on(causal_ts::BatchTsoProvider::new_opt( @@ -310,6 +338,7 @@ where cfg_controller: Some(cfg_controller), security_mgr, pd_client, + router: None, node: None, resolver: None, store_path, @@ -331,6 +360,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, } @@ -567,36 +597,6 @@ where engine_rocks::FlowListener::new(tx) } - fn init_engines(&mut self, raft_engine: ER) { - let tablet_registry = self.tablet_registry.clone().unwrap(); - let mut node = NodeV2::new( - &self.config.server, - self.pd_client.clone(), - None, - tablet_registry, - ); - node.try_bootstrap_store(&self.config.raft_store, &raft_engine) - .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); - assert_ne!(node.id(), 0); - - let router = node.router(); - let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( - router.store_router().clone(), - self.config.coprocessor.clone(), - ); - let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); - - let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); - - self.engines = Some(TikvEngines { - raft_engine, - engine, - }); - self.node = Some(node); - self.coprocessor_host = Some(coprocessor_host); - self.region_info_accessor = Some(region_info_accessor); - } - fn init_gc_worker(&mut self) -> GcWorker> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( @@ -647,13 +647,21 @@ where let engines = self.engines.as_ref().unwrap(); let pd_worker = LazyWorker::new("pd-worker"); - let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + 
resource_ctl, )) } else { None @@ -681,15 +689,16 @@ where let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( self.config.resource_metering.clone(), - collector_reg_handle, + collector_reg_handle.clone(), ); self.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( self.config.resource_metering.receiver_address.clone(), self.env.clone(), - data_sink_reg_handle, + data_sink_reg_handle.clone(), ); self.to_stop.push(single_target_worker); + let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( self.config.resource_metering.clone(), @@ -726,6 +735,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( @@ -773,8 +785,24 @@ where cop_read_pools.handle() }; + let mut unified_read_pool_scale_receiver = None; + if self.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); + cfg_controller.register( + tikv::config::Module::Readpool, + Box::new(ReadPoolConfigManager::new( + unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, + &self.background_worker, + self.config.readpool.unified.max_thread_count, + self.config.readpool.unified.auto_adjust_pool_size, + )), + ); + unified_read_pool_scale_receiver = Some(rx); + } + let check_leader_runner = CheckLeaderRunner::new( - self.node.as_ref().unwrap().router().store_meta().clone(), + self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), ); let check_leader_scheduler = self @@ -832,7 +860,17 @@ where let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); - cfg_controller.register(tikv::config::Module::Split, Box::new(split_config_manager)); + cfg_controller.register( + tikv::config::Module::Split, + Box::new(split_config_manager.clone()), + ); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.config.server.grpc_concurrency, + self.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); // `ConsistencyCheckObserver` must be registered before `Node::start`. 
let safe_point = Arc::new(AtomicU64::new(0)); @@ -855,11 +893,15 @@ where .unwrap() .start( engines.raft_engine.clone(), + self.tablet_registry.clone().unwrap(), + self.router.as_ref().unwrap(), server.transport(), snap_mgr, self.concurrency_manager.clone(), self.causal_ts_provider.clone(), self.coprocessor_host.clone().unwrap(), + auto_split_controller, + collector_reg_handle, self.background_worker.clone(), pd_worker, raft_store, @@ -884,7 +926,11 @@ where initial_metric(&self.config.metric); - self.servers = Some(Servers { lock_mgr, server }); + self.servers = Some(Servers { + lock_mgr, + server, + rsmeter_pubsub_service, + }); server_config } @@ -925,6 +971,16 @@ where &self.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + + if servers + .server + .register_service(create_resource_metering_pub_sub( + servers.rsmeter_pubsub_service.clone(), + )) + .is_some() + { + warn!("failed to register resource metering pubsub service"); + } } fn init_io_utility(&mut self) -> BytesFetcher { @@ -1392,10 +1448,10 @@ impl ConfiguredRaftEngine for RaftLogEngine { } impl TikvServer { - fn init_raw_engines( + fn init_engines( &mut self, flow_listener: engine_rocks::FlowListener, - ) -> (CER, Arc) { + ) -> Arc { let block_cache = self.config.storage.block_cache.build_shared_cache(); let env = self .config @@ -1415,6 +1471,19 @@ impl TikvServer { let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); + + let mut node = NodeV2::new(&self.config.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router().clone(); + + // Create kv engine. 
+ let builder = builder.state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); let factory = Box::new(builder.build()); self.kv_statistics = Some(factory.rocks_statistics()); let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) @@ -1433,7 +1502,25 @@ impl TikvServer { 180, // max_samples_to_preserve )); - (raft_engine, engines_info) + let router = RaftRouter::new(node.id(), router); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.router = Some(router); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + + engines_info } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index f766729a066..84d2f67bbab 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -15,6 +15,7 @@ use engine_traits::{ iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; +use keys::data_key; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; use uuid::{Builder as UuidBuilder, Uuid}; @@ -336,7 +337,7 @@ impl ImportDir { let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; for &(start, end) in TIDB_RANGES_COMPLEMENT { - let opt = iter_option(start, end, false); + let opt = iter_option(&data_key(start), &data_key(end), false); let mut iter = sst_reader.iter(opt)?; if iter.seek(start)? 
{ error!( diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 3e06eb76899..fabe9e2a13a 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -32,7 +32,10 @@ use kvproto::{ kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + codec::{ + bytes::{decode_bytes_in_place, encode_bytes}, + stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + }, config::ReadableSize, stream::block_on_external_io, sys::SysQuota, @@ -53,13 +56,18 @@ use crate::{ #[derive(Default, Debug, Clone)] pub struct DownloadExt<'a> { cache_key: Option<&'a str>, + req_type: DownloadRequestType, } impl<'a> DownloadExt<'a> { - pub fn cache_key(self, key: &'a str) -> Self { - Self { - cache_key: Some(key), - } + pub fn cache_key(mut self, key: &'a str) -> Self { + self.cache_key = Some(key); + self + } + + pub fn req_type(mut self, req_type: DownloadRequestType) -> Self { + self.req_type = req_type; + self } } @@ -763,7 +771,7 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc>, - build_fn: &mut dyn FnMut(Vec, Vec), + mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut smallest_key = None; @@ -896,16 +904,20 @@ impl SstImporter { let sst_reader = RocksSstReader::open_with_env(dst_file_name, Some(env))?; sst_reader.verify_checksum()?; + // undo key rewrite so we could compare with the keys inside SST + let old_prefix = rewrite_rule.get_old_key_prefix(); + let new_prefix = rewrite_rule.get_new_key_prefix(); + let req_type = ext.req_type; + debug!("downloaded file and verified"; "meta" => ?meta, "name" => name, "path" => dst_file_name, + "old_prefix" => log_wrappers::Value::key(old_prefix), + "new_prefix" => log_wrappers::Value::key(new_prefix), + "req_type" => ?req_type, ); - // undo key rewrite so we could compare with the keys inside SST - let old_prefix = rewrite_rule.get_old_key_prefix(); - let new_prefix = rewrite_rule.get_new_key_prefix(); - let range_start = meta.get_range().get_start(); let range_end = meta.get_range().get_end(); let range_start_bound = key_to_bound(range_start); @@ -915,14 +927,14 @@ impl SstImporter { key_to_bound(range_end) }; - let range_start = + let mut range_start = keys::rewrite::rewrite_prefix_of_start_bound(new_prefix, old_prefix, range_start_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST start range", key: range_start.to_vec(), prefix: new_prefix.to_vec(), })?; - let range_end = + let mut range_end = keys::rewrite::rewrite_prefix_of_end_bound(new_prefix, old_prefix, range_end_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST end range", @@ -930,6 +942,11 @@ impl SstImporter { prefix: new_prefix.to_vec(), })?; + if req_type == DownloadRequestType::Keyspace { + range_start = keys::rewrite::encode_bound(range_start); + range_end = keys::rewrite::encode_bound(range_end); + } + let start_rename_rewrite = Instant::now(); // read the first and last keys from the SST, determine if we could // simply move the entire SST instead of iterating and generate a new one. @@ -942,9 +959,15 @@ impl SstImporter { return Ok(None); } if !iter.seek_to_first()? 
{ + let mut range = meta.get_range().clone(); + if req_type == DownloadRequestType::Keyspace { + *range.mut_start() = encode_bytes(&range.take_start()); + *range.mut_end() = encode_bytes(&range.take_end()); + } // the SST is empty, so no need to iterate at all (should be impossible?) - return Ok(Some(meta.get_range().clone())); + return Ok(Some(range)); } + let start_key = keys::origin_key(iter.key()); if is_before_start_bound(start_key, &range_start) { // SST's start is before the range to consume, so needs to iterate to skip over @@ -995,8 +1018,10 @@ impl SstImporter { } // perform iteration and key rewrite. - let mut key = keys::data_key(new_prefix); - let new_prefix_data_key_len = key.len(); + let mut data_key = keys::DATA_PREFIX_KEY.to_vec(); + let data_key_prefix_len = keys::DATA_PREFIX_KEY.len(); + let mut user_key = new_prefix.to_vec(); + let user_key_prefix_len = new_prefix.len(); let mut first_key = None; match range_start { @@ -1016,10 +1041,22 @@ impl SstImporter { .unwrap(); while iter.valid()? { - let old_key = keys::origin_key(iter.key()); - if is_after_end_bound(old_key, &range_end) { + let mut old_key = Cow::Borrowed(keys::origin_key(iter.key())); + let mut ts = None; + + if is_after_end_bound(old_key.as_ref(), &range_end) { break; } + + if req_type == DownloadRequestType::Keyspace { + ts = Some(Key::decode_ts_bytes_from(old_key.as_ref())?.to_owned()); + old_key = { + let mut key = old_key.to_vec(); + decode_bytes_in_place(&mut key, false)?; + Cow::Owned(key) + }; + } + if !old_key.starts_with(old_prefix) { return Err(Error::WrongKeyPrefix { what: "Key in SST", @@ -1027,12 +1064,21 @@ impl SstImporter { prefix: old_prefix.to_vec(), }); } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); + + data_key.truncate(data_key_prefix_len); + user_key.truncate(user_key_prefix_len); + user_key.extend_from_slice(&old_key[old_prefix.len()..]); + if req_type == DownloadRequestType::Keyspace { + data_key.extend(encode_bytes(&user_key)); + data_key.extend(ts.unwrap()); + } else { + data_key.extend_from_slice(&user_key); + } + let mut value = Cow::Borrowed(iter.value()); if rewrite_rule.new_timestamp != 0 { - key = Key::from_encoded(key) + data_key = Key::from_encoded(data_key) .truncate_ts() .map_err(|e| { Error::BadFormat(format!( @@ -1056,10 +1102,10 @@ impl SstImporter { } } - sst_writer.put(&key, &value)?; + sst_writer.put(&data_key, &value)?; iter.next()?; if first_key.is_none() { - first_key = Some(keys::origin_key(&key).to_vec()); + first_key = Some(keys::origin_key(&data_key).to_vec()); } } @@ -1078,7 +1124,7 @@ impl SstImporter { let mut final_range = Range::default(); final_range.set_start(start_key); - final_range.set_end(keys::origin_key(&key).to_vec()); + final_range.set_end(keys::origin_key(&data_key).to_vec()); Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. 
diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index e990924c638..3409a6ef366 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -8,7 +8,7 @@ use std::{ time::Duration, }; -use api_version::{dispatch_api_version, KvFormat, RawValue}; +use api_version::{dispatch_api_version, keyspace::KvPair, ApiV1, KvFormat, RawValue}; use backup::Task; use collections::HashMap; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; @@ -354,7 +354,7 @@ impl TestSuite { Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(snap_store, false), ranges: vec![Range::Interval(IntervalRange::from((start, end)))], scan_backward_in_range: false, @@ -362,8 +362,9 @@ impl TestSuite { is_scanned_range_aware: false, }); let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { - checksum = checksum_crc64_xor(checksum, digest.clone(), &k, &v); + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); + checksum = checksum_crc64_xor(checksum, digest.clone(), k, v); total_kvs += 1; total_bytes += (k.len() + v.len()) as u64; } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index a478e6ee325..6277789b194 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -10,8 +10,11 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } pd_client = { workspace = true } security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } +tokio = { version = "1.0", features = ["full"] } +tokio-stream = "0.1" diff --git a/components/test_pd/src/lib.rs b/components/test_pd/src/lib.rs index 187a899d7fb..bd768e58318 100644 --- a/components/test_pd/src/lib.rs +++ b/components/test_pd/src/lib.rs @@ -1,4 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(slice_group_by)] #[macro_use] extern crate tikv_util; diff --git a/components/test_pd/src/mocker/etcd.rs b/components/test_pd/src/mocker/etcd.rs new file mode 100644 index 00000000000..3939dfc9a72 --- /dev/null +++ b/components/test_pd/src/mocker/etcd.rs @@ -0,0 +1,288 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + collections::{BTreeMap, HashMap}, + ops::Bound, + sync::Arc, +}; + +use futures::lock::Mutex; +use tokio::sync::mpsc::{self, Sender}; +use tokio_stream::wrappers::ReceiverStream; + +use super::Result; + +/// An in-memory, single versioned storage. +/// Emulating some interfaces of etcd for testing. 
+#[derive(Default, Debug)] +pub struct Etcd { + items: BTreeMap, + subs: HashMap, + revision: i64, + sub_id_alloc: Cell, +} + +pub type EtcdClient = Arc>; + +impl Etcd { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + pub fn get_revision(&self) -> i64 { + self.revision + } + + pub fn get_key(&self, keys: Keys) -> (Vec, i64) { + let (start_key, end_key) = keys.into_bound(); + let kvs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, self.revision)), + )) + .collect::>() + .as_slice() + .group_by(|item1, item2| item1.0.0 == item2.0.0) + .filter_map(|group| { + let (k, v) = group.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del => None, + } + }) + .fold(Vec::new(), |mut items, item| { + items.push(item); + items + }); + + (kvs, self.get_revision()) + } + + pub async fn set(&mut self, mut pair: KeyValue) -> Result<()> { + let rev = self.alloc_rev(); + for sub in self.subs.values() { + if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { + sub.tx + .send(KvEvent { + kind: KvEventType::Put, + pair: pair.clone(), + }) + .await + .unwrap(); + } + } + self.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); + Ok(()) + } + + pub async fn delete(&mut self, keys: Keys) -> Result<()> { + let (start_key, end_key) = keys.into_bound(); + let rev = self.alloc_rev(); + let mut v = self + .items + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, self.revision)), + )) + .map(|(k, _)| Key::clone(k)) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for mut victim in v { + let k = Key(victim.0.clone(), rev); + self.items.insert(k, Value::Del); + + for sub in self.subs.values() { + if victim.0.as_slice() < sub.end_key.as_slice() + && victim.0.as_slice() >= sub.start_key.as_slice() + { + sub.tx + .send(KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]), + }) + .await + .unwrap(); + } + } + } + Ok(()) + } + + pub async fn watch(&mut self, keys: Keys, start_rev: i64) -> Result> { + let id = self.sub_id_alloc.get(); + self.sub_id_alloc.set(id + 1); + let (tx, rx) = mpsc::channel(1024); + let (start_key, end_key) = keys.into_bound(); + + // Sending events from [start_rev, now) to the client. + let mut pending = self + .items + .range(( + Bound::Included(Key(start_key.clone(), 0)), + Bound::Excluded(Key(end_key.clone(), self.revision)), + )) + .filter(|(k, _)| k.1 >= start_rev) + .collect::>(); + pending.sort_by_key(|(k, _)| k.1); + for (k, v) in pending { + let event = match v { + Value::Val(val) => KvEvent { + kind: KvEventType::Put, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + Value::Del => KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(k.0.clone()), vec![]), + }, + }; + tx.send(event).await.expect("too many pending events"); + } + + self.subs.insert( + id, + Subscriber { + start_key, + end_key, + tx, + }, + ); + Ok(ReceiverStream::new(rx)) + } + + pub fn clear_subs(&mut self) { + self.subs.clear(); + self.sub_id_alloc.set(0); + } + + /// A tool for dumpling the whole storage when test failed. + /// Add this to test code temporarily for debugging. 
+ #[allow(dead_code)] + pub fn dump(&self) { + println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision); + for (k, v) in self.items.iter() { + println!("{:?} => {:?}", k, v); + } + } +} + +#[derive(Clone, Debug)] +pub struct MetaKey(pub Vec<u8>); + +impl MetaKey { + /// Returns the key such that the range [self, self.next()) contains only + /// `self`. + pub fn next(&self) -> Self { + let mut next = self.clone(); + next.0.push(0); + next + } + + /// Returns the key such that the range [self, self.next_prefix()) contains + /// all keys with the prefix `self`. + pub fn next_prefix(&self) -> Self { + let mut next_prefix = self.clone(); + for i in (0..next_prefix.0.len()).rev() { + if next_prefix.0[i] == u8::MAX { + next_prefix.0.pop(); + } else { + next_prefix.0[i] += 1; + break; + } + } + next_prefix + } +} + +/// A simple key-value pair of metadata. +#[derive(Clone, Debug)] +pub struct KeyValue(pub MetaKey, pub Vec<u8>); + +impl KeyValue { + pub fn key(&self) -> &[u8] { + self.0.0.as_slice() + } + + pub fn value(&self) -> &[u8] { + self.1.as_slice() + } + + pub fn take_key(&mut self) -> Vec<u8> { + std::mem::take(&mut self.0.0) + } + + pub fn take_value(&mut self) -> Vec<u8> { + std::mem::take(&mut self.1) + } +} + +#[derive(Debug)] +pub enum KvEventType { + Put, + Delete, +} + +#[derive(Debug)] +pub struct KvEvent { + pub kind: KvEventType, + pub pair: KeyValue, +} + +#[derive(Debug)] +struct Subscriber { + start_key: Vec<u8>, + end_key: Vec<u8>, + tx: Sender<KvEvent>, +} + +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec<u8>, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!( + "{}@{}", + log_wrappers::Value::key(&self.0), + self.1 + )) + .finish() + } +} + +/// A value (possibly a tombstone). +#[derive(Debug, PartialEq, Clone)] +enum Value { + Val(Vec<u8>), + Del, +} + +/// The key set used for lookups. +#[derive(Debug)] +pub enum Keys { + Prefix(MetaKey), + Range(MetaKey, MetaKey), + Key(MetaKey), +} + +impl Keys { + /// Converts the key set into the corresponding key range.
+ pub fn into_bound(self) -> (Vec, Vec) { + match self { + Keys::Prefix(x) => { + let next = x.next_prefix().0; + ((x.0), (next)) + } + Keys::Range(start, end) => ((start.0), (end.0)), + Keys::Key(k) => { + let next = k.next().0; + ((k.0), (next)) + } + } + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index d904c95d4a8..fc257b12a9f 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -2,15 +2,18 @@ use std::result; +use futures::executor::block_on; use kvproto::pdpb::*; mod bootstrap; +pub mod etcd; mod incompatible; mod leader_change; mod retry; mod service; mod split; +use self::etcd::{EtcdClient, KeyValue, Keys, MetaKey}; pub use self::{ bootstrap::AlreadyBootstrapped, incompatible::Incompatible, @@ -27,29 +30,64 @@ pub type Result = result::Result; pub trait PdMocker { fn load_global_config( &self, - req: &LoadGlobalConfigRequest, + _req: &LoadGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - let mut send = vec![]; - for r in req.get_names() { - let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r.clone())); - i.set_value(r.clone()); - send.push(i); - } let mut res = LoadGlobalConfigResponse::default(); - res.set_items(send.into()); + let mut items = Vec::new(); + let (resp, revision) = block_on(async move { + etcd_client.lock().await.get_key(Keys::Range( + MetaKey(b"".to_vec()), + MetaKey(b"\xff".to_vec()), + )) + }); + + let values: Vec = resp + .iter() + .map(|kv| { + let mut item = GlobalConfigItem::default(); + item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); + item.set_payload(kv.value().into()); + item + }) + .collect(); + + items.extend(values); + res.set_revision(revision); + res.set_items(items.into()); Some(Ok(res)) } fn store_global_config( &self, - _: &StoreGlobalConfigRequest, + req: &StoreGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - unimplemented!() + for item in req.get_changes() { + let cli = etcd_client.clone(); + block_on(async move { + match item.get_kind() { + EventType::Put => { + let kv = + KeyValue(MetaKey(item.get_name().into()), item.get_payload().into()); + cli.lock().await.set(kv).await + } + EventType::Delete => { + let key = Keys::Key(MetaKey(item.get_name().into())); + cli.lock().await.delete(key).await + } + } + }) + .unwrap(); + } + Some(Ok(StoreGlobalConfigResponse::default())) } - fn watch_global_config(&self) -> Option> { - panic!("could not mock this function due to it should return a stream") + fn watch_global_config( + &self, + _req: &WatchGlobalConfigRequest, + ) -> Option> { + unimplemented!() } fn get_members(&self, _: &GetMembersRequest) -> Option> { diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 9e1a2b3bb0f..28d4077b674 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
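The mock above gives tests a minimal etcd-like store: set and delete bump a single revision counter and fan events out to every subscriber whose range covers the key, while watch first replays history at or after start_rev and then streams live events. The snippet below is a hypothetical usage sketch, assuming the types are reachable as test_pd::mocker::etcd::{Etcd, Keys, KeyValue, MetaKey} and that the returned ReceiverStream is consumed with tokio_stream::StreamExt::next; the real server code drives the stream slightly differently.

// Hypothetical test exercising the in-memory etcd mock introduced in this patch.
use test_pd::mocker::etcd::{Etcd, KeyValue, Keys, MetaKey};
use tokio_stream::StreamExt;

#[tokio::test]
async fn put_is_delivered_to_watcher() {
    let mut etcd = Etcd::default();
    // Watch the whole key space from revision 0, so history would be replayed too.
    let mut events = etcd
        .watch(
            Keys::Range(MetaKey(b"".to_vec()), MetaKey(b"\xff".to_vec())),
            0,
        )
        .await
        .unwrap();
    // Store one key; the subscriber registered above receives a Put event.
    etcd.set(KeyValue(
        MetaKey(b"/global/config/foo".to_vec()),
        b"bar".to_vec(),
    ))
    .await
    .unwrap();
    let ev = events.next().await.expect("expected one event");
    assert_eq!(ev.pair.key(), b"/global/config/foo");
    assert_eq!(ev.pair.value(), b"bar");
    // Point reads go through get_key, which also reports the current revision.
    let (kvs, rev) = etcd.get_key(Keys::Key(MetaKey(b"/global/config/foo".to_vec())));
    assert_eq!(kvs.len(), 1);
    assert_eq!(rev, 1);
}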
use std::{ + str::from_utf8, sync::{ atomic::{AtomicI64, Ordering}, Arc, @@ -20,6 +21,7 @@ use pd_client::Error as PdError; use security::*; use super::mocker::*; +use crate::mocker::etcd::{EtcdClient, Keys, KvEventType, MetaKey}; pub struct Server { server: Option, @@ -57,6 +59,7 @@ impl Server { default_handler, case, tso_logical: Arc::new(AtomicI64::default()), + etcd_client: EtcdClient::default(), }; let mut server = Server { server: None, @@ -170,6 +173,7 @@ struct PdMock { default_handler: Arc, case: Option>, tso_logical: Arc, + etcd_client: EtcdClient, } impl Clone for PdMock { @@ -178,6 +182,7 @@ impl Clone for PdMock { default_handler: Arc::clone(&self.default_handler), case: self.case.clone(), tso_logical: self.tso_logical.clone(), + etcd_client: self.etcd_client.clone(), } } } @@ -189,39 +194,71 @@ impl Pd for PdMock { req: LoadGlobalConfigRequest, sink: UnarySink, ) { - hijack_unary(self, ctx, sink, |c| c.load_global_config(&req)) + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| c.load_global_config(&req, cli.clone())) } fn store_global_config( &mut self, - _ctx: RpcContext<'_>, - _req: StoreGlobalConfigRequest, - _sink: UnarySink, + ctx: RpcContext<'_>, + req: StoreGlobalConfigRequest, + sink: UnarySink, ) { - unimplemented!() + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| { + c.store_global_config(&req, cli.clone()) + }) } fn watch_global_config( &mut self, ctx: RpcContext<'_>, - _req: WatchGlobalConfigRequest, + req: WatchGlobalConfigRequest, mut sink: ServerStreamingSink, ) { - ctx.spawn(async move { - let mut name: usize = 0; - loop { + let cli = self.etcd_client.clone(); + let future = async move { + let mut watcher = match cli + .lock() + .await + .watch( + Keys::Range(MetaKey(b"".to_vec()), MetaKey(b"\xff".to_vec())), + req.revision, + ) + .await + { + Ok(w) => w, + Err(err) => { + error!("failed to watch: {:?}", err); + return; + } + }; + + while let Some(event) = watcher.as_mut().recv().await { + info!("watch event from etcd"; "event" => ?event); let mut change = GlobalConfigItem::new(); - change.set_name(format!("/global/config/{:?}", name).to_owned()); - change.set_value(format!("{:?}", name)); + change.set_kind(match event.kind { + KvEventType::Put => EventType::Put, + KvEventType::Delete => EventType::Delete, + }); + change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); + change.set_payload(event.pair.value().into()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); - // simulate network delay - std::thread::sleep(Duration::from_millis(10)); - name += 1; let _ = sink.send((wc, WriteFlags::default())).await; let _ = sink.flush().await; + #[cfg(feature = "failpoints")] + { + use futures::executor::block_on; + let cli_clone = cli.clone(); + fail_point!("watch_global_config_return", |_| { + block_on(async move { cli_clone.lock().await.clear_subs() }); + watcher.close(); + }); + } } - }) + }; + ctx.spawn(future); } fn get_members( diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 513d08643a7..a76692c4a67 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -27,8 +27,8 @@ use keys::{self, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, PeerRole}, pdpb::{ - self, ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, - TransferLeader, + self, BatchSwitchWitness, ChangePeer, ChangePeerV2, CheckPolicy, Merge, + RegionHeartbeatResponse, 
SplitRegion, SwitchWitness, TransferLeader, }, replication_modepb::{ DrAutoSyncState, RegionReplicationStatus, ReplicationMode, ReplicationStatus, @@ -40,7 +40,7 @@ use pd_client::{ }; use raft::eraftpb::ConfChangeType; use tikv_util::{ - store::{check_key_in_region, find_peer, is_learner, new_peer, QueryStats}, + store::{check_key_in_region, find_peer, find_peer_by_id, is_learner, new_peer, QueryStats}, time::{Instant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, Either, HandyRwLock, @@ -135,6 +135,11 @@ enum Operator { remove_peers: Vec, policy: SchedulePolicy, }, + BatchSwitchWitness { + peer_ids: Vec, + is_witnesses: Vec, + policy: SchedulePolicy, + }, } pub fn sleep_ms(ms: u64) { @@ -201,6 +206,22 @@ pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResp resp } +fn switch_witness(peer_id: u64, is_witness: bool) -> SwitchWitness { + let mut sw = SwitchWitness::default(); + sw.set_peer_id(peer_id); + sw.set_is_witness(is_witness); + sw +} + +pub fn new_pd_batch_switch_witnesses(switches: Vec) -> RegionHeartbeatResponse { + let mut switch_witnesses = BatchSwitchWitness::default(); + switch_witnesses.set_switch_witnesses(switches.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_switch_witnesses(switch_witnesses); + resp +} + impl Operator { fn make_region_heartbeat_response( &self, @@ -276,6 +297,17 @@ impl Operator { } new_pd_change_peer_v2(cps) } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + .. + } => { + let mut switches = Vec::with_capacity(peer_ids.len()); + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + switches.push(switch_witness(*peer_id, *is_witness)); + } + new_pd_batch_switch_witnesses(switches) + } } } @@ -360,6 +392,26 @@ impl Operator { add && remove || !policy.schedule() } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + ref mut policy, + } => { + if !policy.schedule() { + return true; + } + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + if region + .get_peers() + .iter() + .any(|p| (p.get_id() == *peer_id) && (p.get_is_witness() != *is_witness)) + || cluster.pending_peers.contains_key(peer_id) + { + return false; + } + } + true + } } } } @@ -1043,6 +1095,48 @@ impl TestPdClient { panic!("region {:?} failed to leave joint", region); } + pub fn must_finish_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + for _ in 1..500 { + sleep_ms(10); + let region = match block_on(self.get_region_by_id(region_id)).unwrap() { + Some(region) => region, + None => continue, + }; + + for p in region.get_peers().iter() { + error!("in must_finish_switch_witnesses, p: {:?}", p); + } + + let mut need_retry = false; + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + match find_peer_by_id(®ion, *peer_id) { + Some(p) => { + if p.get_is_witness() != *is_witness + || self.cluster.rl().pending_peers.contains_key(&p.get_id()) + { + need_retry = true; + break; + } + } + None => { + need_retry = true; + break; + } + } + } + if !need_retry { + return; + } + } + let region = block_on(self.get_region_by_id(region_id)).unwrap(); + panic!("region {:?} failed to finish switch witnesses", region); + } + pub fn add_region(&self, region: &metapb::Region) { self.cluster.wl().add_region(region) } @@ -1072,6 +1166,15 @@ impl TestPdClient { self.schedule_operator(region_id, op); } + pub fn switch_witnesses(&self, region_id: u64, peer_ids: Vec, is_witnesses: Vec) { + let op = Operator::BatchSwitchWitness { + 
peer_ids, + is_witnesses, + policy: SchedulePolicy::TillSuccess, + }; + self.schedule_operator(region_id, op); + } + pub fn joint_confchange( &self, region_id: u64, @@ -1189,6 +1292,16 @@ impl TestPdClient { self.must_none_peer(region_id, peer); } + pub fn must_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + self.switch_witnesses(region_id, peer_ids.clone(), is_witnesses.clone()); + self.must_finish_switch_witnesses(region_id, peer_ids, is_witnesses); + } + pub fn must_joint_confchange( &self, region_id: u64, diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index 71c214ae21d..25a1224e261 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -49,6 +49,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true, features = ["testexport"] } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } server = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index b2330e26f93..81e7129407e 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -46,6 +46,7 @@ use raftstore::{ }, Error, Result, }; +use resource_control::ResourceGroupManager; use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; @@ -80,6 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -174,6 +176,7 @@ pub struct Cluster { pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, + resource_manager: Option>, } impl Cluster { @@ -207,6 +210,7 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + resource_manager: Some(Arc::new(ResourceGroupManager::default())), kv_statistics: vec![], raft_statistics: vec![], } @@ -275,7 +279,8 @@ impl Cluster { // Try start new nodes. for _ in 0..self.count - self.engines.len() { - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let engines = self.dbs.last().unwrap().clone(); @@ -294,6 +299,7 @@ impl Cluster { key_mgr.clone(), router, system, + &self.resource_manager, )?; self.group_props.insert(node_id, props); self.engines.insert(node_id, engines); @@ -345,7 +351,8 @@ impl Cluster { debug!("starting node {}", node_id); let engines = self.engines[&node_id].clone(); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { cfg.server.labels = labels.to_owned(); @@ -365,9 +372,16 @@ impl Cluster { tikv_util::thread_group::set_properties(Some(props)); debug!("calling run node"; "node_id" => node_id); // FIXME: rocksdb event listeners may not work, because we change the router. 
- self.sim - .wl() - .run_node(node_id, cfg, engines, store_meta, key_mgr, router, system)?; + self.sim.wl().run_node( + node_id, + cfg, + engines, + store_meta, + key_mgr, + router, + system, + &self.resource_manager, + )?; debug!("node {} started", node_id); Ok(()) } diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 78d98e5a5d3..05ed8ece83d 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -30,6 +30,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use tempfile::TempDir; use test_pd_client::TestPdClient; @@ -229,6 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + _resource_manager: &Option>, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0ec60e468ee..63a0b4e4804 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -42,6 +42,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::{CollectorRegHandle, ResourceTagFactory}; use security::SecurityManager; use tempfile::TempDir; @@ -264,6 +265,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", cfg.prefer_mem); @@ -414,6 +416,9 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), self.get_causal_ts_provider(node_id), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -649,6 +654,7 @@ impl Simulator for ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), @@ -660,6 +666,7 @@ impl Simulator for ServerCluster { key_manager, router, system, + resource_manager, ) ) } diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index d5c2eefa6d6..4bcb99adca3 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,7 +16,8 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, + CfNamesExt, Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, + CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -101,7 +102,7 @@ pub fn must_region_cleared(engine: &Engines, region assert_eq!(state.get_state(), PeerState::Tombstone, "{:?}", state); let start_key = keys::data_key(region.get_start_key()); let end_key = keys::data_key(region.get_end_key()); - for cf in ALL_CFS { + for cf in engine.kv.cf_names() { engine .kv .scan(cf, &start_key, &end_key, false, |k, v| { @@ -1245,15 +1246,9 @@ pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option { +pub struct RangesScanner { storage: T, ranges_iter: RangesIterator, @@ -34,6 +35,8 
@@ pub struct RangesScanner { working_range_begin_key: Vec, working_range_end_key: Vec, rescheduler: RescheduleChecker, + + _phantom: PhantomData, } // TODO: maybe it's better to make it generic to avoid directly depending @@ -72,7 +75,7 @@ pub struct RangesScannerOptions { pub is_scanned_range_aware: bool, // TODO: This can be const generics } -impl RangesScanner { +impl RangesScanner { pub fn new( RangesScannerOptions { storage, @@ -81,7 +84,7 @@ impl RangesScanner { is_key_only, is_scanned_range_aware, }: RangesScannerOptions, - ) -> RangesScanner { + ) -> RangesScanner { let ranges_len = ranges.len(); let ranges_iter = RangesIterator::new(ranges); RangesScanner { @@ -98,13 +101,14 @@ impl RangesScanner { working_range_begin_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), working_range_end_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), rescheduler: RescheduleChecker::new(), + _phantom: PhantomData, } } /// Fetches next row. // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. - pub async fn next(&mut self) -> Result, StorageError> { + pub async fn next(&mut self) -> Result, StorageError> { self.next_opt(true).await } @@ -114,7 +118,7 @@ impl RangesScanner { pub async fn next_opt( &mut self, update_scanned_range: bool, - ) -> Result, StorageError> { + ) -> Result, StorageError> { loop { let mut force_check = true; let range = self.ranges_iter.next(); @@ -150,14 +154,14 @@ impl RangesScanner { if self.is_scanned_range_aware && update_scanned_range { self.update_scanned_range_from_scanned_row(&some_row); } - if some_row.is_some() { + if let Some(row) = some_row { // Retrieved one row from point range or interval range. if let Some(r) = self.scanned_rows_per_range.last_mut() { *r += 1; } self.rescheduler.check_reschedule(force_check).await; - - return Ok(some_row); + let kv = F::make_kv_pair(row).map_err(|e| StorageError(anyhow::Error::from(e)))?; + return Ok(Some(kv)); } else { // No more row in the range. 
self.ranges_iter.notify_drained(); @@ -288,6 +292,7 @@ impl RangesScanner { #[cfg(test)] mod tests { + use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use super::*; @@ -315,7 +320,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "c")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -323,24 +328,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), b"4".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), b"4".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -351,7 +356,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "bar_2")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -359,20 +364,20 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -382,7 +387,7 @@ mod tests { PointRange::from("foo_3").into(), PointRange::from("bar_3").into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -390,24 +395,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), Vec::new()) ); assert_eq!( - 
block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), Vec::new()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); } @@ -422,7 +427,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "z")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -431,9 +436,9 @@ mod tests { }); let mut scanned_rows_per_range = Vec::new(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2, 0, 1]); @@ -443,21 +448,21 @@ mod tests { assert_eq!(scanned_rows_per_range, vec![0]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0, 2]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![1]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); @@ -477,7 +482,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -497,7 +502,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -513,7 +518,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -529,7 +534,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", 
"foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -537,14 +542,14 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); @@ -567,7 +572,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -575,25 +580,25 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo\0"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"bar\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar\0"); @@ -612,7 +617,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -632,7 +637,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -648,7 +653,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -664,7 +669,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -672,14 +677,14 @@ mod tests 
{ is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2"); assert_eq!(&r.upper_exclusive, b"foo_8"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -700,7 +705,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -708,20 +713,20 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2"); assert_eq!(&r.upper_exclusive, b"box"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar"); assert_eq!(&r.upper_exclusive, b"bar_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -739,7 +744,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -749,7 +754,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -757,7 +762,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -765,7 +770,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -791,7 +796,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -801,7 +806,7 @@ mod tests { // Only lower_inclusive is updated. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -809,7 +814,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -817,7 +822,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -825,7 +830,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -846,7 +851,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -856,7 +861,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -864,7 +869,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -872,7 +877,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -896,7 +901,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -906,7 +911,7 @@ mod tests { // Lower_inclusive is updated. Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -914,7 +919,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -922,7 +927,7 @@ mod tests { // Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -930,7 +935,7 @@ mod tests { // Upper_exclusive is not update. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"box"); diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index e9d96e16284..e670674cdc6 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "Data type of a query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } base64 = "0.13" bitfield = "0.13.2" bitflags = "1.0.1" diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 00f6c22347b..37becbfb801 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -2,6 +2,7 @@ use std::{cmp, convert::TryInto, io::Write, sync::Arc, u8}; +use api_version::KvFormat; use codec::prelude::*; use collections::{HashMap, HashSet}; use kvproto::coprocessor::KeyRange; @@ -75,10 +76,13 @@ pub fn extract_table_prefix(key: &[u8]) -> Result<&[u8]> { } /// Checks if the range is for table record or index. -pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { +pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { for range in ranges { - extract_table_prefix(range.get_start())?; - extract_table_prefix(range.get_end())?; + let (_, start) = + F::parse_keyspace(range.get_start()).map_err(|e| Error::Other(Box::new(e)))?; + let (_, end) = F::parse_keyspace(range.get_end()).map_err(|e| Error::Other(Box::new(e)))?; + extract_table_prefix(start)?; + extract_table_prefix(end)?; if range.get_start() >= range.get_end() { return Err(invalid_type!( "invalid range,range.start should be smaller than range.end, but got [{:?},{:?})", @@ -544,6 +548,7 @@ pub fn generate_index_data_for_test( mod tests { use std::{i64, iter::FromIterator}; + use api_version::ApiV1; use collections::{HashMap, HashSet}; use tipb::ColumnInfo; @@ -790,18 +795,18 @@ mod tests { let mut range = KeyRange::default(); range.set_start(small_key.clone()); range.set_end(large_key.clone()); - check_table_ranges(&[range]).unwrap(); + check_table_ranges::(&[range]).unwrap(); // test range.start > range.end let mut range = KeyRange::default(); range.set_end(small_key.clone()); range.set_start(large_key); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); // test invalid end let mut range = KeyRange::default(); range.set_start(small_key); range.set_end(b"xx".to_vec()); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); } #[test] diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index 123c306c125..331634dbd04 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "A vector query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } async-trait = "0.1" codec = { workspace = true } collections = { workspace = true } diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index ae04ffe03e6..9e415918541 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; 
+use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use codec::{number::NumberCodec, prelude::NumberDecoder}; use itertools::izip; @@ -30,11 +31,13 @@ use DecodeHandleStrategy::*; use super::util::scan_executor::*; use crate::interface::*; -pub struct BatchIndexScanExecutor(ScanExecutor); +pub struct BatchIndexScanExecutor( + ScanExecutor, +); // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchIndexScanExecutor>> { +impl BatchIndexScanExecutor>, ApiV1> { /// Checks whether this executor can be used. #[inline] pub fn check_supported(descriptor: &IndexScan) -> Result<()> { @@ -42,7 +45,7 @@ impl BatchIndexScanExecutor>> { } } -impl BatchIndexScanExecutor { +impl BatchIndexScanExecutor { pub fn new( storage: S, config: Arc, @@ -154,7 +157,7 @@ impl BatchIndexScanExecutor { } #[async_trait] -impl BatchExecutor for BatchIndexScanExecutor { +impl BatchExecutor for BatchIndexScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -975,7 +978,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![columns_info[0].clone(), columns_info[1].clone()], @@ -1028,7 +1031,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1092,7 +1095,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![columns_info[1].clone(), columns_info[0].clone()], @@ -1133,7 +1136,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1185,7 +1188,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1262,7 +1265,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1319,7 +1322,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1433,7 +1436,7 @@ mod tests { let mut value = value_prefix.clone(); value.extend(restore_data); let store = FixtureStorage::from(vec![(key.clone(), value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1476,7 +1479,7 @@ mod tests { let value = value_prefix; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1572,7 +1575,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, vec![])]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1672,7 +1675,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = 
BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1766,7 +1769,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1859,7 +1862,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1985,7 +1988,7 @@ mod tests { let mut value = value_prefix; value.extend(restore_data); let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 551c3da8a7e..392b41ff165 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -2,6 +2,7 @@ use std::{convert::TryFrom, sync::Arc}; +use api_version::KvFormat; use fail::fail_point; use kvproto::coprocessor::KeyRange; use protobuf::Message; @@ -149,6 +150,15 @@ impl BatchExecutorsRunner<()> { ExecType::TypePartitionTableScan => { other_err!("PartitionTableScan executor not implemented"); } + ExecType::TypeSort => { + other_err!("Sort executor not implemented"); + } + ExecType::TypeWindow => { + other_err!("Window executor not implemented"); + } + ExecType::TypeExpand => { + other_err!("Expand executor not implemented"); + } } } @@ -164,7 +174,7 @@ fn is_arrow_encodable(schema: &[FieldType]) -> bool { } #[allow(clippy::explicit_counter_loop)] -pub fn build_executors( +pub fn build_executors( executor_descriptors: Vec, storage: S, ranges: Vec, @@ -192,7 +202,7 @@ pub fn build_executors( let primary_prefix_column_ids = descriptor.take_primary_prefix_column_ids(); Box::new( - BatchTableScanExecutor::new( + BatchTableScanExecutor::<_, F>::new( storage, config.clone(), columns_info, @@ -212,7 +222,7 @@ pub fn build_executors( let columns_info = descriptor.take_columns().into(); let primary_column_ids_len = descriptor.take_primary_column_ids().len(); Box::new( - BatchIndexScanExecutor::new( + BatchIndexScanExecutor::<_, F>::new( storage, config.clone(), columns_info, @@ -364,7 +374,7 @@ pub fn build_executors( } impl BatchExecutorsRunner { - pub fn from_request + 'static>( + pub fn from_request + 'static, F: KvFormat>( mut req: DagRequest, ranges: Vec, storage: S, @@ -380,7 +390,7 @@ impl BatchExecutorsRunner { config.paging_size = paging_size; let config = Arc::new(config); - let out_most_executor = build_executors( + let out_most_executor = build_executors::<_, F>( req.take_executors().into(), storage, ranges, diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 957a23ba8c0..4397869fcaa 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, sync::Arc}; +use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use collections::HashMap; use kvproto::coprocessor::KeyRange; @@ -23,13 +24,15 @@ use tipb::{ColumnInfo, FieldType, TableScan}; use 
super::util::scan_executor::*; use crate::interface::*; -pub struct BatchTableScanExecutor(ScanExecutor); +pub struct BatchTableScanExecutor( + ScanExecutor, +); type HandleIndicesVec = SmallVec<[usize; 2]>; // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchTableScanExecutor>> { +impl BatchTableScanExecutor>, ApiV1> { /// Checks whether this executor can be used. #[inline] pub fn check_supported(descriptor: &TableScan) -> Result<()> { @@ -37,7 +40,7 @@ impl BatchTableScanExecutor>> { } } -impl BatchTableScanExecutor { +impl BatchTableScanExecutor { #[allow(clippy::too_many_arguments)] pub fn new( storage: S, @@ -110,7 +113,7 @@ impl BatchTableScanExecutor { } #[async_trait] -impl BatchExecutor for BatchTableScanExecutor { +impl BatchExecutor for BatchTableScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -702,7 +705,7 @@ mod tests { batch_expect_rows: &[usize], ) { let columns_info = helper.columns_info_by_idx(col_idxs); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), columns_info, @@ -786,7 +789,7 @@ mod tests { fn test_execution_summary() { let helper = TableScanTestHelper::new(); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), helper.columns_info_by_idx(&[0]), @@ -925,7 +928,7 @@ mod tests { // For row 0 + row 1 + (row 2 ~ row 4), we should only get row 0, row 1 and an // error. for corrupted_row_index in 2..=4 { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1032,7 +1035,7 @@ mod tests { // We should get row 0 and error because no further rows should be scanned when // there is an error. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1080,7 +1083,7 @@ mod tests { }); let mut schema = schema.clone(); schema.push(FieldTypeTp::LongLong.into()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info, @@ -1122,7 +1125,7 @@ mod tests { // Let's also repeat case 1 for smaller batch size { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1165,7 +1168,7 @@ mod tests { // Case 2: row 1 + row 2 // We should get error and no row, for the same reason as above. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1186,7 +1189,7 @@ mod tests { // Case 3: row 2 + row 0 // We should get row 2 and row 0. There is no error. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1220,7 +1223,7 @@ mod tests { // Case 4: row 1 // We should get error. 
{ - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1270,7 +1273,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1378,7 +1381,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1559,7 +1562,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 6ef8c6b2224..5ebc65baa25 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -1,20 +1,23 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; +use std::sync::Arc; use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ - codec::{ - batch::{LazyBatchColumn, LazyBatchColumnVec}, - data_type::*, - }, + codec::{batch::LazyBatchColumnVec, data_type::*}, expr::{EvalConfig, EvalContext, EvalWarnings}, }; use tidb_query_expr::{RpnExpression, RpnExpressionBuilder, RpnStackNode}; use tipb::{Expr, FieldType, TopN}; -use crate::{interface::*, util::*}; +use crate::{ + interface::*, + util::{ + top_n_heap::{HeapItemSourceData, HeapItemUnsafe, TopNHeap}, + *, + }, +}; pub struct BatchTopNExecutor { /// The heap, which contains N rows at most. @@ -22,7 +25,7 @@ pub struct BatchTopNExecutor { /// This field is placed before `eval_columns_buffer_unsafe`, `order_exprs`, /// `order_is_desc` and `src` because it relies on data in those fields /// and we want this field to be dropped first. - heap: BinaryHeap, + heap: TopNHeap, /// A collection of all evaluated columns. This is to avoid repeated /// allocations in each `next_batch()`. 
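The `heap` field change above swaps the inline `BinaryHeap` for the reusable `TopNHeap` added later in this diff (`util/top_n_heap.rs`). A minimal standalone sketch of the bounded top-N pattern that heap encapsulates, using plain integers instead of `HeapItemUnsafe` (all names below are illustrative and not part of this patch):

use std::collections::BinaryHeap;

/// Keeps at most `n` smallest items by maintaining a max-heap and replacing
/// its greatest element whenever a smaller item arrives.
struct BoundedTopN {
    n: usize,
    heap: BinaryHeap<i64>,
}

impl BoundedTopN {
    fn new(n: usize) -> Self {
        // Cap the initial capacity so a huge N cannot cause a large up-front allocation.
        Self { n, heap: BinaryHeap::with_capacity(n.min(1024)) }
    }

    fn add(&mut self, v: i64) {
        if self.heap.len() < self.n {
            self.heap.push(v);
        } else if let Some(mut greatest) = self.heap.peek_mut() {
            if v < *greatest {
                // Evict the current greatest item; the heap property is restored
                // when the `PeekMut` guard is dropped.
                *greatest = v;
            }
        }
    }

    fn take_all_sorted(&mut self) -> Vec<i64> {
        std::mem::take(&mut self.heap).into_sorted_vec()
    }
}

The real `TopNHeap::add_row` additionally validates each `HeapItemUnsafe` with `cmp_sort_key` before pushing, since a comparison failure inside `BinaryHeap` would otherwise panic.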
@@ -97,7 +100,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -126,7 +129,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -140,7 +143,7 @@ impl BatchTopNExecutor { } pub fn new( - config: std::sync::Arc, + config: Arc, src: Src, order_exprs_def: Vec, order_is_desc: Vec, @@ -163,8 +166,7 @@ impl BatchTopNExecutor { .collect(); Ok(Self { - // Avoid large N causing OOM - heap: BinaryHeap::with_capacity(n.min(1024)), + heap: TopNHeap::new(n), // Simply large enough to avoid repeated allocations eval_columns_buffer_unsafe: Box::new(Vec::with_capacity(512)), order_exprs: order_exprs.into_boxed_slice(), @@ -182,7 +184,7 @@ impl BatchTopNExecutor { async fn handle_next_batch(&mut self) -> Result> { // Use max batch size from the beginning because top N // always needs to calculate over all data. - let src_result = self.src.next_batch(crate::runner::BATCH_MAX_SIZE).await; + let src_result = self.src.next_batch(BATCH_MAX_SIZE).await; self.context.warnings = src_result.warnings; @@ -193,7 +195,7 @@ impl BatchTopNExecutor { } if src_is_drained { - Ok(Some(self.heap_take_all())) + Ok(Some(self.heap.take_all())) } else { Ok(None) } @@ -240,84 +242,11 @@ impl BatchTopNExecutor { eval_columns_offset: eval_offset, logical_row_index, }; - self.heap_add_row(row)?; - } - - Ok(()) - } - - fn heap_add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { - if self.heap.len() < self.n { - // HeapItemUnsafe must be checked valid to compare in advance, or else it may - // panic inside BinaryHeap. - row.cmp_sort_key(&row)?; - - // Push into heap when heap is not full. - self.heap.push(row); - } else { - // Swap the greatest row in the heap if this row is smaller than that row. - let mut greatest_row = self.heap.peek_mut().unwrap(); - if row.cmp_sort_key(&greatest_row)? == Ordering::Less { - *greatest_row = row; - } + self.heap.add_row(row)?; } Ok(()) } - - #[allow(clippy::clone_on_copy)] - fn heap_take_all(&mut self) -> LazyBatchColumnVec { - let heap = std::mem::take(&mut self.heap); - let sorted_items = heap.into_sorted_vec(); - if sorted_items.is_empty() { - return LazyBatchColumnVec::empty(); - } - - let mut result = sorted_items[0] - .source_data - .physical_columns - .clone_empty(sorted_items.len()); - - for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { - match result_column { - LazyBatchColumn::Raw(dest_column) => { - for item in &sorted_items { - let src = item.source_data.physical_columns[column_index].raw(); - dest_column - .push(&src[item.source_data.logical_rows[item.logical_row_index]]); - } - } - LazyBatchColumn::Decoded(dest_vector_value) => { - match_template::match_template! { - TT = [ - Int, - Real, - Duration, - Decimal, - DateTime, - Bytes => BytesRef, - Json => JsonRef, - Enum => EnumRef, - Set => SetRef, - ], - match dest_vector_value { - VectorValue::TT(dest_column) => { - for item in &sorted_items { - let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); - let src_ref = TT::borrow_vector_value(src); - // TODO: This clone is not necessary. 
- dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); - } - }, - } - } - } - } - } - - result.assert_columns_equal_length(); - result - } } #[async_trait] @@ -402,111 +331,6 @@ impl BatchExecutor for BatchTopNExecutor { } } -struct HeapItemSourceData { - physical_columns: LazyBatchColumnVec, - logical_rows: Vec, -} - -/// The item in the heap of `BatchTopNExecutor`. -/// -/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is -/// valid (i.e. not dropped). Thus it is called unsafe. -struct HeapItemUnsafe { - /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. - order_is_desc_ptr: NonNull<[bool]>, - - /// A pointer to the `order_exprs_field_type` field in `order_exprs`. - order_exprs_field_type_ptr: NonNull<[FieldType]>, - - /// The source data that evaluated column in this structure is using. - source_data: Arc, - - /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. - eval_columns_buffer_ptr: NonNull>>, - - /// The begin offset of the evaluated columns stored in the buffer. - /// - /// The length of evaluated columns in the buffer is `order_is_desc.len()`. - eval_columns_offset: usize, - - /// Which logical row in the evaluated columns this heap item is - /// representing. - logical_row_index: usize, -} - -impl HeapItemUnsafe { - fn get_order_is_desc(&self) -> &[bool] { - unsafe { self.order_is_desc_ptr.as_ref() } - } - - fn get_order_exprs_field_type(&self) -> &[FieldType] { - unsafe { self.order_exprs_field_type_ptr.as_ref() } - } - - fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { - let offset_begin = self.eval_columns_offset; - let offset_end = offset_begin + len; - let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; - &vec_buf[offset_begin..offset_end] - } - - fn cmp_sort_key(&self, other: &Self) -> Result { - // Only debug assert because this function is called pretty frequently. - debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); - - let order_is_desc = self.get_order_is_desc(); - let order_exprs_field_type = self.get_order_exprs_field_type(); - let columns_len = order_is_desc.len(); - let eval_columns_lhs = self.get_eval_columns(columns_len); - let eval_columns_rhs = other.get_eval_columns(columns_len); - - for column_idx in 0..columns_len { - let lhs_node = &eval_columns_lhs[column_idx]; - let rhs_node = &eval_columns_rhs[column_idx]; - let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); - let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); - - // There is panic inside, but will never panic, since the data type of - // corresponding column should be consistent for each - // `HeapItemUnsafe`. - let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; - - if ord == Ordering::Equal { - continue; - } - if !order_is_desc[column_idx] { - return Ok(ord); - } else { - return Ok(ord.reverse()); - } - } - - Ok(Ordering::Equal) - } -} - -/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator -/// fails to parse. So make sure that it is valid before putting it into a heap. 
-impl Ord for HeapItemUnsafe { - fn cmp(&self, other: &Self) -> Ordering { - self.cmp_sort_key(other).unwrap() - } -} - -impl PartialOrd for HeapItemUnsafe { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for HeapItemUnsafe { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for HeapItemUnsafe {} - #[cfg(test)] mod tests { use futures::executor::block_on; diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index 6aa578459e2..ca05e49fcd3 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod hash_aggr_helper; #[cfg(test)] pub mod mock_executor; pub mod scan_executor; +pub mod top_n_heap; use tidb_query_common::Result; use tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalContext}; diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 935db5dd392..75c7cdc9fe3 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::KeyRange; use tidb_query_common::{ @@ -40,12 +41,12 @@ pub trait ScanExecutorImpl: Send { /// A shared executor implementation for both table scan and index scan. /// Implementation differences between table scan and index scan are further /// given via `ScanExecutorImpl`. -pub struct ScanExecutor { +pub struct ScanExecutor { /// The internal scanning implementation. imp: I, /// The scanner that scans over ranges. - scanner: RangesScanner, + scanner: RangesScanner, /// A flag indicating whether this executor is ended. When table is drained /// or there was an error scanning the table, this flag will be set to @@ -63,7 +64,7 @@ pub struct ScanExecutorOptions { pub is_scanned_range_aware: bool, } -impl ScanExecutor { +impl ScanExecutor { pub fn new( ScanExecutorOptions { imp, @@ -75,7 +76,7 @@ impl ScanExecutor { is_scanned_range_aware, }: ScanExecutorOptions, ) -> Result { - tidb_query_datatype::codec::table::check_table_ranges(&key_ranges)?; + tidb_query_datatype::codec::table::check_table_ranges::(&key_ranges)?; if is_backward { key_ranges.reverse(); } @@ -108,10 +109,11 @@ impl ScanExecutor { for i in 0..scan_rows { let some_row = self.scanner.next_opt(i == scan_rows - 1).await?; - if let Some((key, value)) = some_row { + if let Some(row) = some_row { // Retrieved one row from point range or non-point range. - if let Err(e) = self.imp.process_kv_pair(&key, &value, columns) { + let (key, value) = row.kv(); + if let Err(e) = self.imp.process_kv_pair(key, value, columns) { // When there are errors in `process_kv_pair`, columns' length may not be // identical. For example, the filling process may be partially done so that // first several columns have N rows while the rest have N-1 rows. 
Since we do @@ -162,7 +164,7 @@ pub fn check_columns_info_supported(columns_info: &[ColumnInfo]) -> Result<()> { } #[async_trait] -impl BatchExecutor for ScanExecutor { +impl BatchExecutor for ScanExecutor { type StorageStats = S::Statistics; #[inline] diff --git a/components/tidb_query_executors/src/util/top_n_heap.rs b/components/tidb_query_executors/src/util/top_n_heap.rs new file mode 100644 index 00000000000..0cbef103e4d --- /dev/null +++ b/components/tidb_query_executors/src/util/top_n_heap.rs @@ -0,0 +1,211 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; + +use tidb_query_common::Result; +use tidb_query_datatype::codec::{ + batch::{LazyBatchColumn, LazyBatchColumnVec}, + data_type::*, +}; +use tidb_query_expr::RpnStackNode; +use tipb::FieldType; + +/// TopNHeap is the common data structure used in TopN-like executors. +pub struct TopNHeap { + /// The maximum number of rows in the heap. + n: usize, + /// The heap. + heap: BinaryHeap, +} + +impl TopNHeap { + /// parameters: + /// - n: The maximum number of rows in the heaps + /// note: to avoid large N causing OOM, the initial capacity will be limited + /// up to 1024. + pub fn new(n: usize) -> Self { + Self { + n, + // Avoid large N causing OOM + heap: BinaryHeap::with_capacity(n.min(1024)), + } + } + + pub fn add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { + if self.heap.len() < self.n { + // HeapItemUnsafe must be checked valid to compare in advance, or else it may + // panic inside BinaryHeap. + row.cmp_sort_key(&row)?; + + // Push into heap when heap is not full. + self.heap.push(row); + } else { + // Swap the greatest row in the heap if this row is smaller than that row. + let mut greatest_row = self.heap.peek_mut().unwrap(); + if row.cmp_sort_key(&greatest_row)? == Ordering::Less { + *greatest_row = row; + } + } + + Ok(()) + } + + #[allow(clippy::clone_on_copy)] + pub fn take_all(&mut self) -> LazyBatchColumnVec { + let heap = std::mem::take(&mut self.heap); + let sorted_items = heap.into_sorted_vec(); + if sorted_items.is_empty() { + return LazyBatchColumnVec::empty(); + } + + let mut result = sorted_items[0] + .source_data + .physical_columns + .clone_empty(sorted_items.len()); + + for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { + match result_column { + LazyBatchColumn::Raw(dest_column) => { + for item in &sorted_items { + let src = item.source_data.physical_columns[column_index].raw(); + dest_column + .push(&src[item.source_data.logical_rows[item.logical_row_index]]); + } + } + LazyBatchColumn::Decoded(dest_vector_value) => { + match_template::match_template! { + TT = [ + Int, + Real, + Duration, + Decimal, + DateTime, + Bytes => BytesRef, + Json => JsonRef, + Enum => EnumRef, + Set => SetRef, + ], + match dest_vector_value { + VectorValue::TT(dest_column) => { + for item in &sorted_items { + let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); + let src_ref = TT::borrow_vector_value(src); + // TODO: This clone is not necessary. + dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); + } + }, + } + } + } + } + } + + result.assert_columns_equal_length(); + result + } +} + +pub struct HeapItemSourceData { + pub physical_columns: LazyBatchColumnVec, + pub logical_rows: Vec, +} + +/// The item in the heap of `BatchTopNExecutor`. 
+/// +/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is +/// valid (i.e. not dropped). Thus it is called unsafe. +pub struct HeapItemUnsafe { + /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. + pub order_is_desc_ptr: NonNull<[bool]>, + + /// A pointer to the `order_exprs_field_type` field in `order_exprs`. + pub order_exprs_field_type_ptr: NonNull<[FieldType]>, + + /// The source data that evaluated column in this structure is using. + pub source_data: Arc, + + /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. + pub eval_columns_buffer_ptr: NonNull>>, + + /// The begin offset of the evaluated columns stored in the buffer. + /// + /// The length of evaluated columns in the buffer is `order_is_desc.len()`. + pub eval_columns_offset: usize, + + /// Which logical row in the evaluated columns this heap item is + /// representing. + pub logical_row_index: usize, +} + +impl HeapItemUnsafe { + fn get_order_is_desc(&self) -> &[bool] { + unsafe { self.order_is_desc_ptr.as_ref() } + } + + fn get_order_exprs_field_type(&self) -> &[FieldType] { + unsafe { self.order_exprs_field_type_ptr.as_ref() } + } + + fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { + let offset_begin = self.eval_columns_offset; + let offset_end = offset_begin + len; + let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; + &vec_buf[offset_begin..offset_end] + } + + fn cmp_sort_key(&self, other: &Self) -> Result { + // Only debug assert because this function is called pretty frequently. + debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); + + let order_is_desc = self.get_order_is_desc(); + let order_exprs_field_type = self.get_order_exprs_field_type(); + let columns_len = order_is_desc.len(); + let eval_columns_lhs = self.get_eval_columns(columns_len); + let eval_columns_rhs = other.get_eval_columns(columns_len); + + for column_idx in 0..columns_len { + let lhs_node = &eval_columns_lhs[column_idx]; + let rhs_node = &eval_columns_rhs[column_idx]; + let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); + let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); + + // There is panic inside, but will never panic, since the data type of + // corresponding column should be consistent for each + // `HeapItemUnsafe`. + let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; + + if ord == Ordering::Equal { + continue; + } + return if !order_is_desc[column_idx] { + Ok(ord) + } else { + Ok(ord.reverse()) + }; + } + + Ok(Ordering::Equal) + } +} + +/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator +/// fails to parse. So make sure that it is valid before putting it into a heap. 
+impl Ord for HeapItemUnsafe { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp_sort_key(other).unwrap() + } +} + +impl PartialOrd for HeapItemUnsafe { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for HeapItemUnsafe { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for HeapItemUnsafe {} diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 5289f427e93..665448279fb 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -55,6 +55,18 @@ pub fn unary_not_decimal(arg: Option<&Decimal>) -> Result> { Ok(arg.as_ref().map(|v| v.is_zero() as i64)) } +#[rpn_fn(nullable)] +#[inline] +pub fn unary_not_json(arg: Option) -> Result> { + let json_zero = Json::from_i64(0).unwrap(); + Ok(arg.as_ref().map(|v| { + if v == &json_zero.as_ref() { + return 1; + } + 0 + })) +} + #[rpn_fn(nullable)] #[inline] pub fn unary_minus_uint(arg: Option<&Int>) -> Result> { @@ -383,6 +395,26 @@ mod tests { } } + #[test] + fn test_unary_not_json() { + let test_cases = vec![ + (None, None), + (Some(Json::from_i64(0).unwrap()), Some(1)), + (Some(Json::from_i64(1).unwrap()), Some(0)), + ( + Some(Json::from_array(vec![Json::from_i64(0).unwrap()]).unwrap()), + Some(0), + ), + ]; + for (arg, expect_output) in test_cases { + let output = RpnFnScalarEvaluator::new() + .push_param(arg.clone()) + .evaluate(ScalarFuncSig::UnaryNotJson) + .unwrap(); + assert_eq!(output, expect_output, "{:?}", arg.as_ref()); + } + } + #[test] fn test_unary_minus_int() { let unsigned_test_cases = vec![ diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 43b0602ebbb..649a7cfa1c8 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -732,6 +732,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::UnaryNotInt => unary_not_int_fn_meta(), ScalarFuncSig::UnaryNotReal => unary_not_real_fn_meta(), ScalarFuncSig::UnaryNotDecimal => unary_not_decimal_fn_meta(), + ScalarFuncSig::UnaryNotJson => unary_not_json_fn_meta(), ScalarFuncSig::UnaryMinusInt => map_unary_minus_int_func(value, children)?, ScalarFuncSig::UnaryMinusReal => unary_minus_real_fn_meta(), ScalarFuncSig::UnaryMinusDecimal => unary_minus_decimal_fn_meta(), diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 5af54ee61b6..02bfc1c9c55 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -288,8 +288,9 @@ impl WriteEvent { pub struct SnapContext<'a> { pub pb_ctx: &'a Context, pub read_id: Option, - // When start_ts is None and `stale_read` is true, it means acquire a snapshot without any - // consistency guarantee. + // When `start_ts` is None and `stale_read` is true, it means acquire a snapshot without any + // consistency guarantee. This filed is also used to check if a read is allowed in the + // flashback. pub start_ts: Option, // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. @@ -418,7 +419,7 @@ pub trait Engine: Send + Clone + 'static { /// Mark the start of flashback. // It's an infrequent API, use trait object for simplicity. 
- fn start_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + fn start_flashback(&self, _ctx: &Context, _start_ts: u64) -> BoxFuture<'static, Result<()>> { Box::pin(futures::future::ready(Ok(()))) } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 663eb2b681f..1193751b228 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -19,6 +19,7 @@ collections = { workspace = true } cpu-time = "1.0.0" crc32fast = "1.2" crossbeam = "0.8" +crossbeam-skiplist = "0.1" derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" @@ -37,6 +38,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" +parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index 10facfa2287..fd351eecbd4 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -82,3 +82,145 @@ macro_rules! debug(($($args:tt)+) => { macro_rules! trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); + +use std::fmt::{self, Display, Write}; + +use slog::{BorrowedKV, OwnedKVList, Record, KV}; + +struct FormatKeyValueList<'a, W> { + buffer: &'a mut W, + written: bool, +} + +impl<'a, W: Write> slog::Serializer for FormatKeyValueList<'a, W> { + fn emit_arguments(&mut self, key: slog::Key, val: &fmt::Arguments<'_>) -> slog::Result { + if !self.written { + write!(&mut self.buffer, "[{}={}]", key, val).unwrap(); + self.written = true; + } else { + write!(&mut self.buffer, " [{}={}]", key, val).unwrap() + } + Ok(()) + } +} + +/// A helper struct to format the key-value list of a slog logger. It's not +/// exact the same format as `TiKVFormat` and etc. It's just a simple +/// implementation for panic, return errors that doesn't show in normal logs +/// processing. +pub struct SlogFormat<'a>(pub &'a slog::Logger); + +impl<'a> Display for SlogFormat<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut formatter = FormatKeyValueList { + buffer: f, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + self.0 + .list() + .serialize( + &Record::new(&record, &format_args!(""), slog::b!()), + &mut formatter, + ) + .unwrap(); + Ok(()) + } +} + +#[doc(hidden)] +pub fn format_kv_list(buffer: &mut String, kv_list: &OwnedKVList, borrow_list: BorrowedKV<'_>) { + let mut formatter = FormatKeyValueList { + buffer, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + let args = format_args!(""); + let record = Record::new(&record, &args, slog::b!()); + // Serialize borrow list first to make region_id, peer_id at the end. + borrow_list.serialize(&record, &mut formatter).unwrap(); + kv_list.serialize(&record, &mut formatter).unwrap(); +} + +/// A helper macro to panic with the key-value list of a slog logger. +/// +/// Similar to `SlogFormat`, but just panic. +#[macro_export] +macro_rules! 
slog_panic { + ($logger:expr, $msg:expr, $borrowed_kv:expr) => {{ + let owned_kv = ($logger).list(); + let mut s = String::new(); + $crate::log::format_kv_list(&mut s, &owned_kv, $borrowed_kv); + if s.is_empty() { + panic!("{}", $msg) + } else { + panic!("{} {}", $msg, s) + } + }}; + ($logger:expr, $msg:expr) => {{ + $crate::slog_panic!($logger, $msg, slog::b!()) + }}; + ($logger:expr, $msg:expr; $($arg:tt)+) => {{ + $crate::slog_panic!($logger, $msg, slog::b!($($arg)+)) + }}; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_format_kv() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, String::new()); + + let logger = logger.new(slog::o!("a" => 1)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[b=2] [a=1]"); + } + + #[test] + fn test_slog_panic() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test"); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [k=v]"); + + let logger = logger.new(slog::o!("a" => 1)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!( + err.downcast::().unwrap().as_str(), + "test [b=2] [a=1]" + ); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!( + err.downcast::().unwrap().as_str(), + "test [k=v] [b=2] [a=1]" + ); + } +} diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 45249fed9bc..700691f1189 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -3,7 +3,9 @@ //! This module provides an implementation of mpsc channel based on //! crossbeam_channel. Comparing to the crossbeam_channel, this implementation //! supports closed detection and try operations. + pub mod future; +pub mod priority_queue; use std::{ cell::Cell, diff --git a/components/tikv_util/src/mpsc/priority_queue.rs b/components/tikv_util/src/mpsc/priority_queue.rs new file mode 100644 index 00000000000..fac741361db --- /dev/null +++ b/components/tikv_util/src/mpsc/priority_queue.rs @@ -0,0 +1,303 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{ + atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{RecvError, SendError, TryRecvError, TrySendError}; +use crossbeam_skiplist::SkipMap; +use parking_lot_core::{ + park, unpark_all, unpark_one, SpinWait, DEFAULT_PARK_TOKEN, DEFAULT_UNPARK_TOKEN, +}; + +// Create a priority based channel. Sender can send message with priority of +// u64, and receiver will receive messages in ascending order of priority. For +// two messages of same priority, the receiving order follows FIFO. 
+pub fn unbounded() -> (Sender, Receiver) { + let queue = Arc::new(PriorityQueue::new()); + let sender = Sender { + inner: queue.clone(), + }; + let receiver = Receiver { inner: queue }; + (sender, receiver) +} + +struct Cell { + ptr: AtomicPtr, +} + +unsafe impl Send for Cell {} +unsafe impl Sync for Cell {} + +impl Cell { + fn new(value: T) -> Self { + Self { + ptr: AtomicPtr::new(Box::into_raw(Box::new(value))), + } + } + + fn take(&self) -> Option { + let p = self.ptr.swap(std::ptr::null_mut(), Ordering::SeqCst); + if !p.is_null() { + unsafe { Some(*Box::from_raw(p)) } + } else { + None + } + } +} + +impl Drop for Cell { + fn drop(&mut self) { + self.take(); + } +} + +#[derive(Default)] +struct PriorityQueue { + queue: SkipMap>, + + sequencer: AtomicU64, + + senders: AtomicUsize, + receivers: AtomicUsize, +} + +impl PriorityQueue { + pub fn new() -> Self { + Self { + queue: SkipMap::new(), + sequencer: AtomicU64::new(0), + senders: AtomicUsize::new(1), + receivers: AtomicUsize::new(1), + } + } + + pub fn get_map_key(&self, pri: u64) -> MapKey { + MapKey { + priority: pri, + sequence: self.sequencer.fetch_add(1, Ordering::Relaxed), + } + } + + fn is_disconnected(&self) -> bool { + self.senders.load(Ordering::SeqCst) == 0 + } +} + +// When derived `PartialOrd` on structs, it will produce a lexicographic +// ordering based on the top-to-bottom declaration order of the struct’s +// members. +#[derive(Eq, PartialEq, Ord, PartialOrd)] +struct MapKey { + priority: u64, + sequence: u64, +} + +pub struct Sender { + inner: Arc>, +} + +impl Sender { + pub fn try_send(&self, msg: T, pri: u64) -> Result<(), TrySendError> { + self.send(msg, pri) + .map_err(|SendError(msg)| TrySendError::Disconnected(msg)) + } + + pub fn send(&self, msg: T, pri: u64) -> Result<(), SendError> { + if self.inner.receivers.load(Ordering::Acquire) == 0 { + return Err(SendError(msg)); + } + self.inner + .queue + .insert(self.inner.get_map_key(pri), Cell::new(msg)); + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_one(addr, |_| DEFAULT_UNPARK_TOKEN); + } + Ok(()) + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Sender { + fn clone(&self) -> Self { + self.inner.senders.fetch_add(1, Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + let old = self.inner.senders.fetch_sub(1, Ordering::AcqRel); + if old <= 1 { + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_all(addr, DEFAULT_UNPARK_TOKEN); + } + } + } +} + +pub struct Receiver { + inner: Arc>, +} + +impl Receiver { + pub fn try_recv(&self) -> Result { + match self.inner.queue.pop_front() { + Some(entry) => Ok(entry.value().take().unwrap()), + None if self.inner.is_disconnected() => Err(TryRecvError::Disconnected), + None => Err(TryRecvError::Empty), + } + } + + pub fn recv(&self) -> Result { + let mut spin = SpinWait::new(); + loop { + match self.try_recv() { + Ok(msg) => return Ok(msg), + Err(TryRecvError::Disconnected) => { + return Err(RecvError); + } + Err(TryRecvError::Empty) => { + if spin.spin() { + continue; + } + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + park( + addr, + || self.len() == 0 && !self.inner.is_disconnected(), + || {}, + |_, _| {}, + DEFAULT_PARK_TOKEN, + None, + ); + } + } + } + } + } + + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Receiver { + fn clone(&self) -> Self { + self.inner.receivers.fetch_add(1, 
Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + self.inner.receivers.fetch_sub(1, Ordering::AcqRel); + } +} + +#[cfg(test)] +mod tests { + use std::{sync::atomic::AtomicU64, thread, time::Duration}; + + use crossbeam::channel::TrySendError; + use rand::Rng; + + use super::*; + + #[test] + fn test_priority() { + let (tx, rx) = super::unbounded::(); + tx.try_send(1, 2).unwrap(); + tx.send(2, 1).unwrap(); + tx.send(3, 3).unwrap(); + + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(1)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Empty)); + + drop(rx); + assert_eq!(tx.send(2, 1), Err(SendError(2))); + assert_eq!(tx.try_send(2, 1), Err(TrySendError::Disconnected(2))); + + let (tx, rx) = super::unbounded::(); + drop(tx); + assert_eq!(rx.recv(), Err(RecvError)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + + let (tx, rx) = super::unbounded::(); + thread::spawn(move || { + thread::sleep(Duration::from_millis(100)); + tx.send(10, 1).unwrap(); + }); + assert_eq!(rx.recv(), Ok(10)); + + let (tx, rx) = super::unbounded::(); + assert_eq!(tx.len(), 0); + assert_eq!(rx.len(), 0); + tx.send(2, 1).unwrap(); + tx.send(3, 2).unwrap(); + assert_eq!(tx.len(), 2); + assert_eq!(rx.len(), 2); + drop(tx); + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + assert_eq!(rx.recv(), Err(RecvError)); + } + + #[test] + fn test_priority_multi_thread() { + let (tx, rx) = super::unbounded::(); + + let mut handlers = Vec::with_capacity(10); + let expected_count = Arc::new(AtomicU64::new(0)); + let real_counter = Arc::new(AtomicU64::new(0)); + for _ in 0..10 { + let sender = tx.clone(); + let expected_count = expected_count.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + let pri = rng.gen_range(0..1000); + let mut cnt = 0; + for i in 0..1000 { + sender.send(i, pri).unwrap(); + cnt += i; + } + expected_count.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + for _i in 0..10 { + let recv = rx.clone(); + let real_counter = real_counter.clone(); + let handle = thread::spawn(move || { + let mut cnt = 0; + while let Ok(v) = recv.recv() { + cnt += v; + } + real_counter.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + drop(tx); + for h in handlers { + h.join().unwrap(); + } + assert_eq!( + expected_count.load(Ordering::Relaxed), + real_counter.load(Ordering::Relaxed) + ); + } +} diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 49e6812b81f..797da2aea54 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -22,7 +22,7 @@ use mnt::get_mount; use sysinfo::RefreshKind; pub use sysinfo::{CpuExt, DiskExt, NetworkExt, ProcessExt, SystemExt}; -use crate::config::{ReadableSize, KIB}; +use crate::config::ReadableSize; pub const HIGH_PRI: i32 = -1; const CPU_CORES_QUOTA_ENV_VAR_KEY: &str = "TIKV_CPU_CORES_QUOTA"; @@ -93,7 +93,7 @@ impl SysQuota { fn sysinfo_memory_limit_in_bytes() -> u64 { let system = sysinfo::System::new_with_specifics(RefreshKind::new().with_memory()); - system.total_memory() * KIB + system.total_memory() } } diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index e761fac8bb5..26dbf495f54 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -405,6 
+405,13 @@ impl Worker { }); } + pub fn spawn_async_task(&self, f: F) + where + F: Future + Send + 'static, + { + self.remote.spawn(f); + } + fn delay_notify(tx: UnboundedSender>, timeout: Duration) { let now = Instant::now(); let f = GLOBAL_TIMER_HANDLE diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 9de2d49cb07..f010b508aaa 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -15,7 +15,7 @@ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; -use yatp::task::future; +use yatp::{queue::Extras, task::future}; pub type ThreadPool = yatp::ThreadPool; @@ -28,6 +28,8 @@ struct Env { } #[derive(Clone)] +// FuturePool wraps a yatp thread pool providing task count metrics and gate +// maximum running tasks. pub struct FuturePool { inner: Arc, } @@ -82,7 +84,14 @@ impl FuturePool { where F: Future + Send + 'static, { - self.inner.spawn(TrackedFuture::new(future)) + self.inner.spawn(TrackedFuture::new(future), None) + } + + pub fn spawn_with_extras(&self, future: F, extras: Extras) -> Result<(), Full> + where + F: Future + Send + 'static, + { + self.inner.spawn(TrackedFuture::new(future), Some(extras)) } /// Spawns a future in the pool and returns a handle to the result of the @@ -143,7 +152,7 @@ impl PoolInner { } } - fn spawn(&self, future: F) -> Result<(), Full> + fn spawn(&self, future: F, extras: Option) -> Result<(), Full> where F: Future + Send + 'static, { @@ -154,11 +163,17 @@ impl PoolInner { metrics_running_task_count.inc(); - self.pool.spawn(async move { + let f = async move { let _ = future.await; metrics_handled_task_count.inc(); metrics_running_task_count.dec(); - }); + }; + + if let Some(extras) = extras { + self.pool.spawn(future::TaskCell::new(f, extras)); + } else { + self.pool.spawn(f); + } Ok(()) } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 6e246d6cddf..305d2162482 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -10,7 +10,7 @@ pub use future_pool::{Full, FuturePool}; use prometheus::{local::LocalHistogram, Histogram}; use yatp::{ pool::{CloneRunnerBuilder, Local, Runner}, - queue::{multilevel, QueueType, TaskCell as _}, + queue::{multilevel, priority, QueueType, TaskCell as _}, task::future::{Runner as FutureRunner, TaskCell}, ThreadPool, }; @@ -198,42 +198,42 @@ impl YatpPoolBuilder { } } - pub fn config(&mut self, config: Config) -> &mut Self { + pub fn config(self, config: Config) -> Self { // TODO: maybe we should use (1, num_cpu) for min and max thread count. 
self.thread_count(config.workers, config.workers, config.workers) .stack_size(config.stack_size) .max_tasks(config.workers.saturating_mul(config.max_tasks_per_worker)) } - pub fn stack_size(&mut self, val: usize) -> &mut Self { + pub fn stack_size(mut self, val: usize) -> Self { self.stack_size = val; self } - pub fn name_prefix(&mut self, val: impl Into) -> &mut Self { + pub fn name_prefix(mut self, val: impl Into) -> Self { let name = val.into(); self.name_prefix = Some(name); self } pub fn thread_count( - &mut self, + mut self, min_thread_count: usize, core_thread_count: usize, max_thread_count: usize, - ) -> &mut Self { + ) -> Self { self.min_thread_count = min_thread_count; self.core_thread_count = core_thread_count; self.max_thread_count = max_thread_count; self } - pub fn max_tasks(&mut self, tasks: usize) -> &mut Self { + pub fn max_tasks(mut self, tasks: usize) -> Self { self.max_tasks = tasks; self } - pub fn before_stop(&mut self, f: F) -> &mut Self + pub fn before_stop(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -241,7 +241,7 @@ impl YatpPoolBuilder { self } - pub fn after_start(&mut self, f: F) -> &mut Self + pub fn after_start(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -249,7 +249,7 @@ impl YatpPoolBuilder { self } - pub fn before_pause(&mut self, f: F) -> &mut Self + pub fn before_pause(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -257,13 +257,32 @@ impl YatpPoolBuilder { self } - pub fn build_future_pool(&mut self) -> FuturePool { + pub fn build_future_pool(self) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; let pool = self.build_single_level_pool(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); - FuturePool::from_pool(pool, name, self.core_thread_count, self.max_tasks) + FuturePool::from_pool(pool, &name, size, task) + } + + pub fn build_priority_future_pool( + self, + priority_provider: Arc, + ) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; + let pool = self.build_priority_pool(priority_provider); + FuturePool::from_pool(pool, &name, size, task) } - pub fn build_single_level_pool(&mut self) -> ThreadPool { + pub fn build_single_level_pool(self) -> ThreadPool { let (builder, runner) = self.create_builder(); builder.build_with_queue_and_runner( yatp::queue::QueueType::SingleLevel, @@ -271,9 +290,12 @@ impl YatpPoolBuilder { ) } - pub fn build_multi_level_pool(&mut self) -> ThreadPool { + pub fn build_multi_level_pool(self) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); let (builder, read_pool_runner) = self.create_builder(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); let multilevel_builder = multilevel::Builder::new(multilevel::Config::default().name(Some(name))); let runner_builder = @@ -282,8 +304,25 @@ impl YatpPoolBuilder { .build_with_queue_and_runner(QueueType::Multilevel(multilevel_builder), runner_builder) } - fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + pub fn build_priority_pool( + self, + priority_provider: Arc, + ) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let (builder, read_pool_runner) = 
self.create_builder(); + let priority_builder = priority::Builder::new( + priority::Config::default().name(Some(name)), + priority_provider, + ); + let runner_builder = priority_builder.runner_builder(CloneRunnerBuilder(read_pool_runner)); + builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) + } + + fn create_builder(mut self) -> (yatp::Builder, YatpPoolRunner) { + let name = self.name_prefix.unwrap_or_else(|| "yatp_pool".to_string()); let mut builder = yatp::Builder::new(thd_name!(name)); builder .stack_size(self.stack_size) @@ -295,7 +334,7 @@ impl YatpPoolBuilder { let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 60e64bf444a..15779df426a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -192,6 +192,16 @@ impl Key { Ok(number::decode_u64_desc(&mut ts)?.into()) } + /// Decode the timestamp from a ts encoded key and return in bytes. + #[inline] + pub fn decode_ts_bytes_from(key: &[u8]) -> Result<&[u8], codec::Error> { + let len = key.len(); + if len < number::U64_SIZE { + return Err(codec::Error::KeyLength); + } + Ok(&key[key.len() - number::U64_SIZE..]) + } + /// Whether the user key part of a ts encoded key `ts_encoded_key` equals to /// the encoded user key `user_key`. /// diff --git a/engine_store_ffi/Cargo.toml b/engine_store_ffi/Cargo.toml index 4fef4837a8b..f24a69afae2 100644 --- a/engine_store_ffi/Cargo.toml +++ b/engine_store_ffi/Cargo.toml @@ -22,6 +22,7 @@ test-engines-rocksdb = [ test-engines-panic = [ "engine_test/test-engines-panic", ] +enable-pagestorage = [] cloud-aws = ["sst_importer/cloud-aws"] cloud-gcp = ["sst_importer/cloud-gcp"] @@ -62,6 +63,7 @@ online_config = { workspace = true } ordered-float = "2.6" parking_lot = "0.12" pd_client = { workspace = true, default-features = false } +portable-atomic = "0.3" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } diff --git a/engine_store_ffi/src/ffihub_impl.rs b/engine_store_ffi/src/ffihub_impl.rs new file mode 100644 index 00000000000..d6569756da7 --- /dev/null +++ b/engine_store_ffi/src/ffihub_impl.rs @@ -0,0 +1,110 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_tiflash::{FsStatsExt, RawPSWriteBatchPtr, RawPSWriteBatchWrapper}; + +use crate::{ + interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, PageAndCppStrWithView, + RawCppPtr, +}; + +pub struct TiFlashFFIHub { + pub engine_store_server_helper: &'static EngineStoreServerHelper, +} +unsafe impl Send for TiFlashFFIHub {} +unsafe impl Sync for TiFlashFFIHub {} +impl engine_tiflash::FFIHubInner for TiFlashFFIHub { + fn get_store_stats(&self) -> engine_tiflash::FsStatsExt { + self.engine_store_server_helper + .handle_compute_store_stats() + .into() + } + + fn create_write_batch(&self) -> RawPSWriteBatchWrapper { + self.engine_store_server_helper.create_write_batch().into() + } + + fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper) { + self.engine_store_server_helper + .gc_raw_cpp_ptr(wb_wrapper.ptr, wb_wrapper.type_); + } + + fn consume_write_batch(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.consume_write_batch(wb) + } + + fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize { + self.engine_store_server_helper.write_batch_size(wb) as usize + } + + fn write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool { + self.engine_store_server_helper.write_batch_is_empty(wb) != 0 + } + + fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_merge(lwb, rwb) + } + + fn write_batch_clear(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_clear(wb) + } + + fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]) { + self.engine_store_server_helper + .write_batch_put_page(wb, page_id.into(), page.into()) + } + + fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]) { + self.engine_store_server_helper + .write_batch_del_page(wb, page_id.into()) + } + + fn read_page(&self, page_id: &[u8]) -> Option> { + // TODO maybe we can steal memory from C++ here to reduce redundant copy? 
+ let value = self.engine_store_server_helper.read_page(page_id.into()); + return if value.view.len == 0 { + None + } else { + Some(value.view.to_slice().to_vec()) + }; + } + + fn scan_page( + &self, + start_page_id: &[u8], + end_page_id: &[u8], + f: &mut dyn FnMut(&[u8], &[u8]) -> engine_traits::Result, + ) { + let values = self + .engine_store_server_helper + .scan_page(start_page_id.into(), end_page_id.into()); + let arr = values.inner as *mut PageAndCppStrWithView; + for i in 0..values.len { + let value = unsafe { &*arr.offset(i as isize) }; + if value.page_view.len != 0 { + f(value.key_view.to_slice(), value.page_view.to_slice()).unwrap(); + } + } + } +} + +impl From for RawPSWriteBatchWrapper { + fn from(src: RawCppPtr) -> Self { + let result = RawPSWriteBatchWrapper { + ptr: src.ptr, + type_: src.type_, + }; + let mut src = src; + src.ptr = std::ptr::null_mut(); + result + } +} + +#[allow(clippy::from_over_into)] +impl Into for ffi_interfaces::StoreStats { + fn into(self) -> FsStatsExt { + FsStatsExt { + available: self.fs_stats.avail_size, + capacity: self.fs_stats.capacity_size, + used: self.fs_stats.used_size, + } + } +} diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index c7633f6010c..24e7db30543 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -51,6 +51,13 @@ pub mod root { } #[repr(u32)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub enum SpecialCppPtrType { + None = 0, + TupleOfRawCppPtr = 1, + ArrayOfRawCppPtr = 2, + } + #[repr(u32)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum EngineStoreApplyRes { None = 0, Persist = 1, @@ -136,6 +143,34 @@ pub mod root { pub inner: root::DB::RawCppPtr, pub view: root::DB::BaseBuffView, } + #[repr(C)] + #[derive(Debug)] + pub struct PageAndCppStrWithView { + pub page: root::DB::RawCppPtr, + pub key: root::DB::RawCppPtr, + pub page_view: root::DB::BaseBuffView, + pub key_view: root::DB::BaseBuffView, + } + #[repr(C)] + #[derive(Debug)] + pub struct RawCppPtrCarr { + pub inner: root::DB::RawVoidPtr, + pub len: u64, + pub type_: root::DB::RawCppPtrType, + } + #[repr(C)] + #[derive(Debug)] + pub struct RawCppPtrTuple { + pub inner: *mut root::DB::RawCppPtr, + pub len: u64, + } + #[repr(C)] + #[derive(Debug)] + pub struct RawCppPtrArr { + pub inner: *mut root::DB::RawVoidPtr, + pub len: u64, + pub type_: root::DB::RawCppPtrType, + } #[repr(u8)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum HttpRequestStatus { @@ -230,6 +265,23 @@ pub mod root { Error = 1, NotFound = 2, } + #[repr(u32)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub enum FastAddPeerStatus { + Ok = 0, + WaitForData = 1, + OtherError = 2, + NoSuitable = 3, + BadData = 4, + FailedInject = 5, + } + #[repr(C)] + #[derive(Debug)] + pub struct FastAddPeerRes { + pub status: root::DB::FastAddPeerStatus, + pub apply_state: root::DB::CppStrWithView, + pub region: root::DB::CppStrWithView, + } #[repr(C)] #[derive(Debug)] pub struct RaftStoreProxyFFIHelper { @@ -368,6 +420,61 @@ pub mod root { arg5: u64, ) -> u8, >, + pub fn_create_write_batch: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + ) -> root::DB::RawCppPtr, + >, + pub fn_write_batch_put_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::BaseBuffView, + arg3: root::DB::BaseBuffView, + ), + >, + pub fn_write_batch_del_page: ::std::option::Option< + unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: 
root::DB::BaseBuffView), + >, + pub fn_write_batch_size: + ::std::option::Option u64>, + pub fn_write_batch_is_empty: + ::std::option::Option u8>, + pub fn_write_batch_merge: ::std::option::Option< + unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::RawVoidPtr), + >, + pub fn_write_batch_clear: + ::std::option::Option, + pub fn_consume_write_batch: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::RawVoidPtr, + ), + >, + pub fn_handle_read_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + ) -> root::DB::CppStrWithView, + >, + pub fn_handle_scan_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + arg3: root::DB::BaseBuffView, + ) -> root::DB::RawCppPtrCarr, + >, + pub fn_handle_purge_pagestorage: ::std::option::Option< + unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap), + >, + pub fn_handle_seek_ps_key: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + ) -> root::DB::CppStrWithView, + >, + pub fn_ps_is_empty: ::std::option::Option< + unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap) -> u8, + >, pub fn_atomic_update_proxy: ::std::option::Option< unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, @@ -424,6 +531,20 @@ pub mod root { pub fn_gc_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::RawCppPtrType), >, + pub fn_gc_raw_cpp_ptr_carr: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::RawCppPtrType, + arg3: u64, + ), + >, + pub fn_gc_special_raw_cpp_ptr: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: u64, + arg3: root::DB::SpecialCppPtrType, + ), + >, pub fn_get_config: ::std::option::Option< unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, @@ -451,8 +572,15 @@ pub mod root { leader_safe_ts: u64, ), >, + pub fn_fast_add_peer: ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut root::DB::EngineStoreServerWrap, + region_id: u64, + new_peer_id: u64, + ) -> root::DB::FastAddPeerRes, + >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 15776819379826780689; + pub const RAFT_STORE_PROXY_VERSION: u64 = 17394545035928865111; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 41ec9d6a8ff..fb2ce037590 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(drain_filter)] +#![feature(let_chains)] #[allow(dead_code)] pub mod interfaces; @@ -7,8 +8,10 @@ pub mod interfaces; pub mod basic_ffi_impls; pub mod domain_impls; pub mod encryption_impls; +pub mod ffihub_impl; mod lock_cf_reader; pub mod observer; +pub mod ps_engine; mod read_index_helper; pub mod sst_reader_impls; mod utils; @@ -26,7 +29,9 @@ pub use basic_ffi_impls::*; pub use domain_impls::*; use encryption::DataKeyManager; pub use encryption_impls::*; +pub use engine_tiflash::EngineStoreConfig; use engine_traits::{Peekable, CF_LOCK}; +pub use ffihub_impl::TiFlashFFIHub; use kvproto::{kvrpcpb, metapb, raft_cmdpb}; use lazy_static::lazy_static; use protobuf::Message; @@ -34,10 +39,12 @@ pub use read_index_helper::ReadIndexClient; pub use sst_reader_impls::*; pub use self::interfaces::root::DB::{ - BaseBuffView, ColumnFamilyType, CppStrVecView, EngineStoreApplyRes, EngineStoreServerHelper, - EngineStoreServerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, - KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, - RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, + BaseBuffView, ColumnFamilyType, CppStrVecView, CppStrWithView, EngineStoreApplyRes, + EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, + FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, + PageAndCppStrWithView, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, + RawCppPtrArr, RawCppPtrCarr, RawCppPtrTuple, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, + SpecialCppPtrType, StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -350,7 +357,7 @@ impl RaftStoreProxyFFIHelper { } impl RawCppPtr { - fn into_raw(mut self) -> RawVoidPtr { + pub fn into_raw(mut self) -> RawVoidPtr { let ptr = self.ptr; self.ptr = std::ptr::null_mut(); ptr @@ -375,13 +382,111 @@ impl Drop for RawCppPtr { } } +impl RawCppPtrTuple { + pub fn is_null(&self) -> bool { + unsafe { (*self.inner).ptr.is_null() } + } +} + +unsafe impl Send for RawCppPtrTuple {} + +impl Drop for RawCppPtrTuple { + fn drop(&mut self) { + // Note the layout is: + // [0] RawCppPtr to T + // [1] RawCppPtr to R + // ... + // [len-1] RawCppPtr to S + unsafe { + if !self.is_null() { + let helper = get_engine_store_server_helper(); + let len = self.len; + // Delete all `void *`. + for i in 0..len { + let i = i as usize; + let inner_i = self.inner.add(i); + // Will not fire even without the if in tests, + // since type must be 0 which is None. + if !inner_i.is_null() { + helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + // We still set to nullptr, even though we will immediately delete it. + (*inner_i).ptr = std::ptr::null_mut(); + } + } + // Delete `void **`. + helper.gc_special_raw_cpp_ptr( + self.inner as RawVoidPtr, + self.len, + SpecialCppPtrType::TupleOfRawCppPtr, + ); + self.inner = std::ptr::null_mut(); + self.len = 0; + } + } + } +} + +impl RawCppPtrArr { + pub fn is_null(&self) -> bool { + self.inner.is_null() + } +} + +unsafe impl Send for RawCppPtrArr {} + +impl Drop for RawCppPtrArr { + fn drop(&mut self) { + // Note the layout is: + // [0] RawVoidPtr to T + // [1] RawVoidPtr + // ... 
+ // [len-1] RawVoidPtr + unsafe { + if !self.is_null() { + let helper = get_engine_store_server_helper(); + let len = self.len; + // Delete all `T *` + for i in 0..len { + let i = i as usize; + let inner_i = self.inner.add(i); + // Will fire even without the if in tests, since type is not 0. + if !(*inner_i).is_null() { + helper.gc_raw_cpp_ptr(*inner_i, self.type_); + // We still set to nullptr, even though we will immediately delete it. + *inner_i = std::ptr::null_mut(); + } + } + // Delete `T **` + helper.gc_special_raw_cpp_ptr( + self.inner as RawVoidPtr, + self.len, + SpecialCppPtrType::ArrayOfRawCppPtr, + ); + self.inner = std::ptr::null_mut(); + self.len = 0; + } + } + } +} + +impl Drop for RawCppPtrCarr { + fn drop(&mut self) { + if !self.inner.is_null() { + let helper = get_engine_store_server_helper(); + helper.gc_raw_cpp_ptr_carr(self.inner as RawVoidPtr, self.type_, self.len); + self.inner = std::ptr::null_mut(); + self.len = 0; + } + } +} + static mut ENGINE_STORE_SERVER_HELPER_PTR: isize = 0; pub fn get_engine_store_server_helper_ptr() -> isize { unsafe { ENGINE_STORE_SERVER_HELPER_PTR } } -fn get_engine_store_server_helper() -> &'static EngineStoreServerHelper { +pub fn get_engine_store_server_helper() -> &'static EngineStoreServerHelper { gen_engine_store_server_helper(unsafe { ENGINE_STORE_SERVER_HELPER_PTR }) } @@ -414,6 +519,25 @@ impl EngineStoreServerHelper { } } + fn gc_raw_cpp_ptr_carr(&self, ptr: *mut ::std::os::raw::c_void, tp: RawCppPtrType, len: u64) { + debug_assert!(self.fn_gc_raw_cpp_ptr_carr.is_some()); + unsafe { + (self.fn_gc_raw_cpp_ptr_carr.into_inner())(ptr, tp, len); + } + } + + fn gc_special_raw_cpp_ptr( + &self, + ptr: *mut ::std::os::raw::c_void, + hint_len: u64, + tp: SpecialCppPtrType, + ) { + debug_assert!(self.fn_gc_special_raw_cpp_ptr.is_some()); + unsafe { + (self.fn_gc_special_raw_cpp_ptr.into_inner())(ptr, hint_len, tp); + } + } + pub fn handle_compute_store_stats(&self) -> StoreStats { debug_assert!(self.fn_handle_compute_store_stats.is_some()); unsafe { (self.fn_handle_compute_store_stats.into_inner())(self.inner) } @@ -508,6 +632,75 @@ impl EngineStoreServerHelper { } } + pub fn create_write_batch(&self) -> RawCppPtr { + debug_assert!(self.fn_create_write_batch.is_some()); + unsafe { (self.fn_create_write_batch.into_inner())(self.inner) } + } + + pub fn write_batch_put_page(&self, wb: RawVoidPtr, page_id: BaseBuffView, page: BaseBuffView) { + debug_assert!(self.fn_write_batch_put_page.is_some()); + unsafe { (self.fn_write_batch_put_page.into_inner())(wb, page_id, page) } + } + + pub fn write_batch_del_page(&self, wb: RawVoidPtr, page_id: BaseBuffView) { + debug_assert!(self.fn_write_batch_del_page.is_some()); + unsafe { (self.fn_write_batch_del_page.into_inner())(wb, page_id) } + } + + pub fn write_batch_size(&self, wb: RawVoidPtr) -> u64 { + debug_assert!(self.fn_write_batch_size.is_some()); + unsafe { (self.fn_write_batch_size.into_inner())(wb) } + } + + pub fn write_batch_is_empty(&self, wb: RawVoidPtr) -> u8 { + debug_assert!(self.fn_write_batch_is_empty.is_some()); + unsafe { (self.fn_write_batch_is_empty.into_inner())(wb) } + } + + pub fn write_batch_merge(&self, lwb: RawVoidPtr, rwb: RawVoidPtr) { + debug_assert!(self.fn_write_batch_merge.is_some()); + unsafe { (self.fn_write_batch_merge.into_inner())(lwb, rwb) } + } + + pub fn write_batch_clear(&self, wb: RawVoidPtr) { + debug_assert!(self.fn_write_batch_clear.is_some()); + unsafe { (self.fn_write_batch_clear.into_inner())(wb) } + } + + pub fn consume_write_batch(&self, wb: 
RawVoidPtr) { + debug_assert!(self.fn_consume_write_batch.is_some()); + unsafe { (self.fn_consume_write_batch.into_inner())(self.inner, wb) } + } + + pub fn read_page(&self, page_id: BaseBuffView) -> CppStrWithView { + debug_assert!(self.fn_handle_read_page.is_some()); + unsafe { (self.fn_handle_read_page.into_inner())(self.inner, page_id) } + } + + pub fn scan_page( + &self, + start_page_id: BaseBuffView, + end_page_id: BaseBuffView, + ) -> RawCppPtrCarr { + debug_assert!(self.fn_handle_scan_page.is_some()); + unsafe { (self.fn_handle_scan_page.into_inner())(self.inner, start_page_id, end_page_id) } + } + + pub fn purge_pagestorage(&self) { + debug_assert!(self.fn_handle_purge_pagestorage.is_some()); + unsafe { (self.fn_handle_purge_pagestorage.into_inner())(self.inner) } + } + + pub fn seek_ps_key(&self, page_id: BaseBuffView) -> CppStrWithView { + debug_assert!(self.fn_handle_seek_ps_key.is_some()); + unsafe { (self.fn_handle_seek_ps_key.into_inner())(self.inner, page_id) } + } + + pub fn is_ps_empty(&self) -> u8 { + debug_assert!(self.fn_ps_is_empty.is_some()); + unsafe { (self.fn_ps_is_empty.into_inner())(self.inner) } + } + pub fn pre_handle_snapshot( &self, region: &metapb::Region, @@ -565,7 +758,10 @@ impl EngineStoreServerHelper { } } - fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { + // Generate a cpp string, so the other side can read. + // The string is owned by the otherside, and will be deleted by + // `gc_raw_cpp_ptr`. + pub fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { debug_assert!(self.fn_gen_cpp_string.is_some()); unsafe { (self.fn_gen_cpp_string.into_inner())(buff.into()).into_raw() as RawCppStringPtr } } @@ -657,6 +853,11 @@ impl EngineStoreServerHelper { ) } } + + pub fn fast_add_peer(&self, region_id: u64, new_peer_id: u64) -> FastAddPeerRes { + debug_assert!(self.fn_fast_add_peer.is_some()); + unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id, new_peer_id) } + } } impl Clone for RaftStoreProxyPtr { diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 0b13f6ca274..2885b89d62e 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1,70 +1,59 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- use std::{ + collections::hash_map::Entry as MapEntry, + io::Write, ops::DerefMut, path::PathBuf, str::FromStr, sync::{atomic::Ordering, mpsc, Arc, Mutex, RwLock}, + time::SystemTime, }; use collections::HashMap; -use engine_tiflash::FsStatsExt; -use engine_traits::SstMetaInfo; +use engine_tiflash::{CachedRegionInfo, CachedRegionInfoManager}; +use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::RaftApplyState, + raft_serverpb::{PeerState, RaftApplyState, RaftMessage, RegionLocalState}, }; -use raft::StateRole; +use protobuf::Message; +use raft::{eraftpb, eraftpb::MessageType, StateRole}; use raftstore::{ coprocessor::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, - BoxApplySnapshotObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, - BoxUpdateSafeTsObserver, Cmd, Coprocessor, CoprocessorHost, ObserverContext, - PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, RegionState, + BoxApplySnapshotObserver, BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, + BoxRegionChangeObserver, BoxRoleObserver, BoxUpdateSafeTsObserver, Cmd, Coprocessor, + CoprocessorHost, MessageObserver, ObserverContext, PdTaskObserver, QueryObserver, + RegionChangeEvent, RegionChangeObserver, RegionState, RoleChange, RoleObserver, StoreSizeInfo, UpdateSafeTsObserver, }, - store, - store::{check_sst_for_ingestion, snap::plain_file_used, SnapKey}, + store::{ + self, check_sst_for_ingestion, + snap::{plain_file_used, SnapEntry}, + SnapKey, SnapManager, Transport, + }, + Error as RaftStoreError, Result as RaftStoreResult, }; use sst_importer::SstImporter; -use tikv_util::{debug, error, info, warn}; +use tikv_util::{box_err, crit, debug, defer, error, info, store::find_peer, warn}; use yatp::{ pool::{Builder, ThreadPool}, task::future::TaskCell, }; use crate::{ - gen_engine_store_server_helper, - interfaces::root::{DB as ffi_interfaces, DB::EngineStoreApplyRes}, - name_to_cf, ColumnFamilyType, EngineStoreServerHelper, RaftCmdHeader, RawCppPtr, TiFlashEngine, + gen_engine_store_server_helper, interfaces::root::DB::EngineStoreApplyRes, name_to_cf, + ColumnFamilyType, EngineStoreServerHelper, RaftCmdHeader, RawCppPtr, TiFlashEngine, WriteCmdType, WriteCmds, CF_LOCK, }; -#[allow(clippy::from_over_into)] -impl Into for ffi_interfaces::StoreStats { - fn into(self) -> FsStatsExt { - FsStatsExt { - available: self.fs_stats.avail_size, - capacity: self.fs_stats.capacity_size, - used: self.fs_stats.used_size, - } - } -} - -pub struct TiFlashFFIHub { - pub engine_store_server_helper: &'static EngineStoreServerHelper, -} -unsafe impl Send for TiFlashFFIHub {} -unsafe impl Sync for TiFlashFFIHub {} -impl engine_tiflash::FFIHubInner for TiFlashFFIHub { - fn get_store_stats(&self) -> engine_tiflash::FsStatsExt { - self.engine_store_server_helper - .handle_compute_store_stats() - .into() - } +macro_rules! fatal { + ($lvl:expr $(, $arg:expr)*) => ({ + crit!($lvl $(, $arg)*); + ::std::process::exit(1) + }) } - pub struct PtrWrapper(RawCppPtr); unsafe impl Send for PtrWrapper {} @@ -90,42 +79,580 @@ impl PrehandleTask { unsafe impl Send for PrehandleTask {} unsafe impl Sync for PrehandleTask {} -pub struct TiFlashObserver { +// TiFlash observer's priority should be higher than all other observers, to +// avoid being bypassed. 
+const TIFLASH_OBSERVER_PRIORITY: u32 = 0; + +pub struct PackedEnvs { + pub engine_store_cfg: crate::EngineStoreConfig, + pub pd_endpoints: Vec, +} + +pub struct TiFlashObserver { pub store_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, pub engine: TiFlashEngine, + pub raft_engine: ER, pub sst_importer: Arc, pub pre_handle_snapshot_ctx: Arc>, pub snap_handle_pool_size: usize, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, + // TODO should we use a Mutex here? + pub trans: Arc>, + pub snap_mgr: Arc, + pub packed_envs: Arc, +} + +pub fn get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) } -impl Clone for TiFlashObserver { +pub fn validate_remote_peer_region( + new_region: &kvproto::metapb::Region, + store_id: u64, + new_peer_id: u64, +) -> bool { + match find_peer(new_region, store_id) { + Some(peer) => peer.get_id() == new_peer_id, + None => false, + } +} + +impl Clone for TiFlashObserver { fn clone(&self) -> Self { TiFlashObserver { store_id: self.store_id, engine_store_server_helper: self.engine_store_server_helper, engine: self.engine.clone(), + raft_engine: self.raft_engine.clone(), sst_importer: self.sst_importer.clone(), pre_handle_snapshot_ctx: self.pre_handle_snapshot_ctx.clone(), snap_handle_pool_size: self.snap_handle_pool_size, apply_snap_pool: self.apply_snap_pool.clone(), pending_delete_ssts: self.pending_delete_ssts.clone(), + trans: self.trans.clone(), + snap_mgr: self.snap_mgr.clone(), + packed_envs: self.packed_envs.clone(), } } } -// TiFlash observer's priority should be higher than all other observers, to -// avoid being bypassed. -const TIFLASH_OBSERVER_PRIORITY: u32 = 0; +impl TiFlashObserver { + pub fn is_initialized(&self, region_id: u64) -> bool { + match get_region_local_state(&self.engine, region_id) { + None => false, + Some(r) => { + raftstore::store::util::is_region_initialized(r.get_region()) + && (r.get_state() != PeerState::Tombstone) + } + } + } + + pub fn get_cached_manager(&self) -> Arc { + self.engine + .cached_region_info_manager + .as_ref() + .unwrap() + .clone() + } + + // Returns whether we need to ignore this message and run fast path instead. + pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { + if !self.packed_envs.engine_store_cfg.enable_fast_add_peer { + // fast path not enabled + return false; + } + let inner_msg = msg.get_message(); + if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { + return false; + } else if inner_msg.get_msg_type() == MessageType::MsgAppend { + } else { + return false; + } + // TODO We don't need to recover all region infomation from restart, + // since we have `has_already_inited`. 
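+        // Only MsgAppend can reach this point: heartbeats and every other message type
+        // returned false above and are handled by the normal Raft path.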
+ let inner_msg = msg.get_message(); + if inner_msg.get_msg_type() != MessageType::MsgAppend { + // we only handles the first MsgAppend + return false; + } + let region_id = msg.get_region_id(); + let new_peer_id = msg.get_to_peer().get_id(); + let cached_manager = self.get_cached_manager(); + let mut is_first = false; + let mut is_replicated = false; + let mut has_already_inited = None; + let mut early_skip = false; + let f = |info: MapEntry>| { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + match info { + MapEntry::Occupied(mut o) => { + // Test if a fast path is timeout + let fast_path_start = o.get().fast_add_peer_start.load(Ordering::SeqCst); + if fast_path_start != 0 { + let elapsed = current.as_millis() - fast_path_start; + #[cfg(any(test, feature = "testexport"))] + const TRACE_SLOW_MILLIS: u128 = 0; + #[cfg(any(test, feature = "testexport"))] + const FALLBACK_MILLIS: u128 = 1000 * 2; + #[cfg(not(any(test, feature = "testexport")))] + const TRACE_SLOW_MILLIS: u128 = 1000 * 60 * 3; + #[cfg(not(any(test, feature = "testexport")))] + const FALLBACK_MILLIS: u128 = 1000 * 60 * 5; + if elapsed >= TRACE_SLOW_MILLIS { + let need_fallback = elapsed > FALLBACK_MILLIS; + // TODO If snapshot is sent, we need fallback but can't do fallback? + let do_fallback = need_fallback; + info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + "elapsed" => elapsed, + "do_fallback" => do_fallback, + ); + if do_fallback { + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + is_first = false; + early_skip = false; + return; + } + } + } + // If a snapshot is sent, we must skip further handling. + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + if last != 0 { + early_skip = true; + // We must return here to avoid changing `inited_or_fallback`. + // Otherwise will cause different value in pre/post_apply_snapshot. + return; + } + (is_first, has_already_inited) = + if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + // If `has_already_inited` is true: + // 1. We recover from a restart, + // 2. The peer is created by TiKV like split; + // So we have data in disk, but not in memory. + // In these cases, we need to check everytime. + + // TODO We can then remove logics in apply snapshot. + // This is because if the next maybe_fast_path after apply snapshot + // will have has_already_inited == true, which leads to normal + // MsgAppend. + let has_already_inited = self.is_initialized(region_id); + if has_already_inited { + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } + (!has_already_inited, Some(has_already_inited)) + } else { + (false, None) + }; + if is_first { + // Don't care if the exchange succeeds. 
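+                        // compare_exchange only swaps in the current timestamp while
+                        // fast_add_peer_start is still 0, so a duplicated MsgAppend cannot
+                        // overwrite the start time of an already running fast path.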
+ let _ = o.get_mut().fast_add_peer_start.compare_exchange( + 0, + current.as_millis(), + Ordering::SeqCst, + Ordering::SeqCst, + ); + } + // TODO include create + is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + info!("fast path: ongoing {}:{} {}, first message", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + ); + let c = CachedRegionInfo::default(); + c.fast_add_peer_start + .store(current.as_millis(), Ordering::SeqCst); + v.insert(Arc::new(c)); + is_first = true; + } + } + }; + + // Try not acquire write lock firstly. + match cached_manager.get_inited_or_fallback(region_id) { + Some(true) => { + // Most cases, when the peer is already inited. + is_first = false; + } + None | Some(false) => self + .get_cached_manager() + .access_cached_region_info_mut(region_id, f) + .unwrap(), + }; + + #[cfg(any(test, feature = "testexport"))] + { + if is_first { + info!("fast path: ongoing {}:{} {}, MsgAppend skipped", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); + } else { + info!("fast path: ongoing {}:{} {}, MsgAppend accepted", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); + } + } + + // If early_skip is true, we don't read the value of `is_first`. + if early_skip { + return true; + } + + if !is_first { + // Most cases, the region is already inited or fallback. + // Skip fast add peer. + return false; + } + + // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, + // see `check_msg`. + if !is_replicated { + info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + ); + return true; + } + + info!("fast path: ongoing {}:{} {}, fetch data from remote peer", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + ); + fail::fail_point!("ffi_fast_add_peer_pause", |_| { return false }); + // Feed data + let res = self + .engine_store_server_helper + .fast_add_peer(region_id, new_peer_id); + match res.status { + crate::FastAddPeerStatus::Ok => (), + crate::FastAddPeerStatus::WaitForData => { + info!( + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + self.store_id, region_id, new_peer_id; + "region_id" => region_id, + ); + return true; + } + _ => { + error!( + "fast path: ongoing {}:{} {} failed. 
fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, + ); + cached_manager.fallback_to_slow_path(region_id); + return false; + } + }; + + let apply_state_str = res.apply_state.view.to_slice(); + let region_str = res.region.view.to_slice(); + let mut apply_state = RaftApplyState::default(); + let mut new_region = kvproto::metapb::Region::default(); + if let Err(_e) = apply_state.merge_from_bytes(apply_state_str) { + error!( + "fast path: ongoing {}:{} {} failed. parse apply_state {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, + ); + cached_manager.fallback_to_slow_path(region_id); + } + if let Err(_e) = new_region.merge_from_bytes(region_str) { + error!( + "fast path: ongoing {}:{} {} failed. parse region {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, + ); + cached_manager.fallback_to_slow_path(region_id); + } + + // Validate + // check if the source already knows the know peer + if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { + info!( + "fast path: ongoing {}:{} {}. failed remote peer has not applied conf change", + self.store_id, region_id, new_peer_id; + "region_id" => region_id, + "region" => ?new_region, + ); + cached_manager.fallback_to_slow_path(region_id); + return false; + } + + info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "new_region" => ?new_region, + "apply_state" => ?apply_state, + ); + match self.build_and_send_snapshot(region_id, new_peer_id, msg, apply_state, new_region) { + Ok(s) => { + match s { + crate::FastAddPeerStatus::Ok => { + fail::fail_point!("go_fast_path_not_allow", |_| { return false }); + info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + ); + } + crate::FastAddPeerStatus::WaitForData => { + info!( + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + new_peer_id, self.store_id, region_id; + "region_id" => region_id, + ); + return true; + } + _ => { + error!( + "fast path: ongoing {}:{} {} failed. build and sent snapshot code {:?}", + self.store_id, region_id, new_peer_id, s; + "region_id" => region_id, + ); + cached_manager.fallback_to_slow_path(region_id); + return false; + } + }; + } + Err(e) => { + error!( + "fast path: ongoing {}:{} {} failed. build and sent snapshot error {:?}", + self.store_id, region_id, new_peer_id, e; + "region_id" => region_id, + ); + cached_manager.fallback_to_slow_path(region_id); + return false; + } + }; + is_first + } + + fn check_entry_at_index( + &self, + region_id: u64, + index: u64, + peer_id: u64, + tag: &str, + ) -> RaftStoreResult { + match self.raft_engine.get_entry(region_id, index)? 
{ + Some(entry) => Ok(entry.get_term()), + None => { + return Err(box_err!( + "can't find entry for index {} of region {}, peer_id: {}, tag {}", + index, + region_id, + peer_id, + tag + )); + } + } + } + + fn build_and_send_snapshot( + &self, + region_id: u64, + new_peer_id: u64, + msg: &RaftMessage, + apply_state: RaftApplyState, + new_region: kvproto::metapb::Region, + ) -> RaftStoreResult { + let cached_manager = self.get_cached_manager(); + let inner_msg = msg.get_message(); + + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + #[cfg(any(test, feature = "testexport"))] + { + let fake_send: bool = (|| { + fail::fail_point!("fast_add_peer_fake_send", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; + if fake_send { + // A handling snapshot may block handling later MsgAppend. + // So we fake send. + cached_manager + .set_snapshot_inflight(region_id, current.as_millis()) + .unwrap(); + return Ok(crate::FastAddPeerStatus::Ok); + } + } + + // Get a snapshot object. + let (mut snapshot, key) = { + // Find term of entry at applied_index. + let applied_index = apply_state.get_applied_index(); + let applied_term = + self.check_entry_at_index(region_id, applied_index, new_peer_id, "applied_index")?; + // Will otherwise cause "got message with lower index than committed" loop. + self.check_entry_at_index( + region_id, + apply_state.get_commit_index(), + new_peer_id, + "commit_index", + )?; + + let key = SnapKey::new(region_id, applied_term, applied_index); + self.snap_mgr.register(key.clone(), SnapEntry::Generating); + defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); + let snapshot = self.snap_mgr.get_snapshot_for_building(&key)?; + for cf in snapshot.cf_files().iter() { + info!("!!!! snapshot cf_file of {} size {:?}", cf.cf, cf.size); + } + + (snapshot, key.clone()) + }; + + // Build snapshot by do_snapshot + let mut pb_snapshot: eraftpb::Snapshot = Default::default(); + let pb_snapshot_metadata: &mut eraftpb::SnapshotMetadata = pb_snapshot.mut_metadata(); + let mut pb_snapshot_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + { + // eraftpb::SnapshotMetadata + for (_, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { + let cf_index: RaftStoreResult = snapshot + .cf_files() + .iter() + .position(|x| &x.cf == cf) + .ok_or(box_err!("can't find index for cf {}", cf)); + let cf_index = cf_index?; + let cf_file = &snapshot.cf_files()[cf_index]; + // Create fake cf file. + let mut path = cf_file.path.clone(); + path.push(cf_file.file_prefix.clone()); + path.set_extension("sst"); + let mut f = std::fs::File::create(path.as_path())?; + f.flush()?; + f.sync_all()?; + } + pb_snapshot_data.set_region(new_region.clone()); + pb_snapshot_data.set_file_size(0); + const SNAPSHOT_VERSION: u64 = 2; + pb_snapshot_data.set_version(SNAPSHOT_VERSION); + + // SnapshotMeta + // Which is snap.meta_file.meta + let snapshot_meta = + raftstore::store::snap::gen_snapshot_meta(snapshot.cf_files(), true)?; + + // Write MetaFile + { + // let v = snapshot_meta.write_to_bytes()?; + // let mut f = std::fs::File::create(snapshot.meta_path())?; + // info!("!!!!! 
create snapshot meta file {:?}", snapshot.meta_path()); + // f.write_all(&v[..])?; + // f.flush()?; + // f.sync_all()?; + // snapshot.mut_meta_file().meta.insert(snapshot_meta.clone()); + snapshot.set_snapshot_meta(snapshot_meta.clone())?; + // snapshot.set_hold_tmp_files(false); + snapshot.save_meta_file()?; + } + pb_snapshot_data.set_meta(snapshot_meta); + } + + pb_snapshot_metadata + .set_conf_state(raftstore::store::util::conf_state_from_region(&new_region)); + pb_snapshot_metadata.set_index(key.idx); + pb_snapshot_metadata.set_term(key.term); + + pb_snapshot.set_data(pb_snapshot_data.write_to_bytes().unwrap().into()); + + // Send reponse + let mut response = RaftMessage::default(); + let epoch = new_region.get_region_epoch(); + response.set_region_epoch(epoch.clone()); + response.set_region_id(region_id); + response.set_from_peer(msg.get_from_peer().clone()); + response.set_to_peer(msg.get_to_peer().clone()); + + let message = response.mut_message(); + message.set_msg_type(MessageType::MsgSnapshot); + message.set_term(inner_msg.get_term()); + message.set_snapshot(pb_snapshot); + // If no set, will result in a MsgResponse to peer 0. + message.set_from(msg.get_from_peer().get_id()); + message.set_to(msg.get_to_peer().get_id()); + debug!( + "!!!! send snapshot to {} key {} raft message {:?} snap data {:?} apply_state {:?}", + msg.get_to_peer().get_id(), + key, + response, + pb_snapshot_data, + apply_state + ); + + match self.trans.lock() { + Ok(mut trans) => match trans.send(response) { + Ok(_) => { + cached_manager + .set_snapshot_inflight(region_id, current.as_millis()) + .unwrap(); + // If we don't flush here, packet will lost. + trans.flush(); + } + Err(RaftStoreError::RegionNotFound(_)) => (), + _ => return Ok(crate::FastAddPeerStatus::OtherError), + }, + Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), + } + + Ok(crate::FastAddPeerStatus::Ok) + } +} -impl TiFlashObserver { +impl TiFlashObserver { + #[allow(clippy::too_many_arguments)] pub fn new( store_id: u64, engine: engine_tiflash::RocksEngine, + raft_engine: ER, sst_importer: Arc, snap_handle_pool_size: usize, + trans: T, + snap_mgr: SnapManager, + packed_envs: PackedEnvs, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -133,15 +660,20 @@ impl TiFlashObserver { let snap_pool = Builder::new(tikv_util::thd_name!("region-task")) .max_thread_count(snap_handle_pool_size) .build_future_pool(); + TiFlashObserver { store_id, engine_store_server_helper, engine, + raft_engine, sst_importer, pre_handle_snapshot_ctx: Arc::new(Mutex::new(PrehandleContext::default())), snap_handle_pool_size, apply_snap_pool: Some(Arc::new(snap_pool)), pending_delete_ssts: Arc::new(RwLock::new(vec![])), + trans: Arc::new(Mutex::new(trans)), + snap_mgr: Arc::new(snap_mgr), + packed_envs: Arc::new(packed_envs), } } @@ -174,6 +706,14 @@ impl TiFlashObserver { TIFLASH_OBSERVER_PRIORITY, BoxUpdateSafeTsObserver::new(self.clone()), ); + coprocessor_host.registry.register_role_observer( + TIFLASH_OBSERVER_PRIORITY, + BoxRoleObserver::new(self.clone()), + ); + coprocessor_host.registry.register_message_observer( + TIFLASH_OBSERVER_PRIORITY, + BoxMessageObserver::new(self.clone()), + ); } fn handle_ingest_sst_for_engine_store( @@ -245,14 +785,14 @@ impl TiFlashObserver { } } -impl Coprocessor for TiFlashObserver { +impl Coprocessor for TiFlashObserver { fn stop(&self) { info!("shutdown tiflash observer"; "store_id" => self.store_id); 
self.apply_snap_pool.as_ref().unwrap().shutdown(); } } -impl AdminObserver for TiFlashObserver { +impl AdminObserver for TiFlashObserver { fn pre_exec_admin( &self, ob_ctx: &mut ObserverContext<'_>, @@ -415,7 +955,7 @@ impl AdminObserver for TiFlashObserver { } } -impl QueryObserver for TiFlashObserver { +impl QueryObserver for TiFlashObserver { fn on_empty_cmd(&self, ob_ctx: &mut ObserverContext<'_>, index: u64, term: u64) { fail::fail_point!("on_empty_cmd_normal", |_| {}); debug!("encounter empty cmd, maybe due to leadership change"; @@ -583,7 +1123,7 @@ impl QueryObserver for TiFlashObserver { } } -impl UpdateSafeTsObserver for TiFlashObserver { +impl UpdateSafeTsObserver for TiFlashObserver { fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { self.engine_store_server_helper.handle_safe_ts_update( region_id, @@ -593,21 +1133,26 @@ impl UpdateSafeTsObserver for TiFlashObserver { } } -impl RegionChangeObserver for TiFlashObserver { +impl RegionChangeObserver for TiFlashObserver { fn on_region_changed( &self, ob_ctx: &mut ObserverContext<'_>, e: RegionChangeEvent, _: StateRole, ) { + let region_id = ob_ctx.region().get_id(); if e == RegionChangeEvent::Destroy { info!( "observe destroy"; - "region_id" => ob_ctx.region().get_id(), + "region_id" => region_id, "store_id" => self.store_id, ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { + self.get_cached_manager() + .remove_cached_region_info(region_id); + } } } @@ -650,12 +1195,22 @@ impl RegionChangeObserver for TiFlashObserver { } fn pre_write_apply_state(&self, _ob_ctx: &mut ObserverContext<'_>) -> bool { - fail::fail_point!("on_pre_persist_with_finish", |_| { true }); + fail::fail_point!("on_pre_persist_with_finish", |_| { + // Some test need persist apply state for Leader logic, + // including fast add peer. 
+ true + }); false } } -impl PdTaskObserver for TiFlashObserver { +impl MessageObserver for TiFlashObserver { + fn on_raft_message(&self, msg: &RaftMessage) -> bool { + !self.maybe_fast_path(&msg) + } +} + +impl PdTaskObserver for TiFlashObserver { fn on_compute_engine_size(&self, store_size: &mut Option) { let stats = self.engine_store_server_helper.handle_compute_store_stats(); let _ = store_size.insert(StoreSizeInfo { @@ -726,7 +1281,7 @@ fn pre_handle_snapshot_impl( PtrWrapper(ptr) } -impl ApplySnapshotObserver for TiFlashObserver { +impl ApplySnapshotObserver for TiFlashObserver { #[allow(clippy::single_match)] fn pre_apply_snapshot( &self, @@ -735,10 +1290,12 @@ impl ApplySnapshotObserver for TiFlashObserver { snap_key: &store::SnapKey, snap: Option<&store::Snapshot>, ) { + let region_id = ob_ctx.region().get_id(); info!("pre apply snapshot"; "peer_id" => peer_id, - "region_id" => ob_ctx.region().get_id(), + "region_id" => region_id, "snap_key" => ?snap_key, + "has_snap" => snap.is_some(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); fail::fail_point!("on_ob_pre_handle_snapshot", |_| {}); @@ -757,20 +1314,55 @@ impl ApplySnapshotObserver for TiFlashObserver { return; }); - let (sender, receiver) = mpsc::channel(); - let task = Arc::new(PrehandleTask::new(receiver, peer_id)); - { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); - let ctx = lock.deref_mut(); - ctx.tracer.insert(snap_key.clone(), task.clone()); + let mut should_skip = false; + #[allow(clippy::collapsible_if)] + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { + if self.get_cached_manager().access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(o) => { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { + info!("fast path: prehandle first snapshot {}:{} {}", self.store_id, region_id, peer_id; + "snap_key" => ?snap_key, + "region_id" => region_id, + ); + should_skip = true; + } + } + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot poisoned") + }; + } + + if should_skip { + return; } - let engine_store_server_helper = self.engine_store_server_helper; - let region = ob_ctx.region().clone(); - let snap_key = snap_key.clone(); - let ssts = retrieve_sst_files(snap); match self.apply_snap_pool.as_ref() { Some(p) => { + let (sender, receiver) = mpsc::channel(); + let task = Arc::new(PrehandleTask::new(receiver, peer_id)); + { + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(_) => fatal!("pre_apply_snapshot poisoned"), + }; + let ctx = lock.deref_mut(); + ctx.tracer.insert(snap_key.clone(), task.clone()); + } + + let engine_store_server_helper = self.engine_store_server_helper; + let region = ob_ctx.region().clone(); + let snap_key = snap_key.clone(); + let ssts = retrieve_sst_files(snap); + + // We use thread pool to do pre handling. 
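+                // Take the pending-applies slot before the pre-handle task runs on the pool;
+                // post_apply_snapshot releases it again once the pre-handled snapshot is consumed.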
self.engine .pending_applies_count .fetch_add(1, Ordering::SeqCst); @@ -786,7 +1378,13 @@ impl ApplySnapshotObserver for TiFlashObserver { &snap_key, ); match sender.send(res) { - Err(_e) => error!("pre apply snapshot err when send to receiver"), + Err(_e) => { + error!("pre apply snapshot err when send to receiver"; + "region_id" => region.get_id(), + "peer_id" => task.peer_id, + "snap_key" => ?snap_key, + ) + } Ok(_) => (), } }); @@ -811,21 +1409,68 @@ impl ApplySnapshotObserver for TiFlashObserver { fail::fail_point!("on_ob_post_apply_snapshot", |_| { return; }); + let region_id = ob_ctx.region().get_id(); info!("post apply snapshot"; "peer_id" => ?peer_id, "snap_key" => ?snap_key, + "region_id" => region_id, "region" => ?ob_ctx.region(), + "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); + let mut should_skip = false; + #[allow(clippy::collapsible_if)] + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { + if self.get_cached_manager().access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + let total = o.get().fast_add_peer_start.load(Ordering::SeqCst); + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + info!("fast path: applied first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; + "snap_key" => ?snap_key, + "region_id" => region_id, + "cost_snapshot" => current.as_millis() - last, + "cost_total" => current.as_millis() - total, + ); + should_skip = true; + o.get_mut().snapshot_inflight.store(0, Ordering::SeqCst); + o.get_mut().fast_add_peer_start.store(0, Ordering::SeqCst); + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } + } + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot poisoned") + }; + } + + if should_skip { + return; + } + let snap = match snap { None => return, Some(s) => s, }; - let maybe_snapshot = { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); + let maybe_prehandle_task = { + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(_) => fatal!("post_apply_snapshot poisoned"), + }; let ctx = lock.deref_mut(); ctx.tracer.remove(snap_key) }; - let need_retry = match maybe_snapshot { + + let need_retry = match maybe_prehandle_task { Some(t) => { let neer_retry = match t.recv.recv() { Ok(snap_ptr) => { @@ -835,8 +1480,10 @@ impl ApplySnapshotObserver for TiFlashObserver { "region_id" => ob_ctx.region().get_id(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); - self.engine_store_server_helper - .apply_pre_handled_snapshot(snap_ptr.0); + if !should_skip { + self.engine_store_server_helper + .apply_pre_handled_snapshot(snap_ptr.0); + } false } Err(_) => { @@ -849,9 +1496,16 @@ impl ApplySnapshotObserver for TiFlashObserver { true } }; - self.engine + // According to pre_apply_snapshot, if registered tracer, + // then we must have put it into thread pool. 
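+        // This decrement pairs with the fetch_add done in pre_apply_snapshot when the task
+        // was queued; the testexport assertion below checks that the counter never underflows.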
+ let _prev = self + .engine .pending_applies_count .fetch_sub(1, Ordering::SeqCst); + + #[cfg(any(test, feature = "testexport"))] + assert!(_prev > 0); + info!("apply snapshot finished"; "peer_id" => peer_id, "snap_key" => ?snap_key, @@ -873,7 +1527,9 @@ impl ApplySnapshotObserver for TiFlashObserver { true } }; - if need_retry { + + if need_retry && !should_skip { + // Blocking pre handle. let ssts = retrieve_sst_files(snap); let ptr = pre_handle_snapshot_impl( self.engine_store_server_helper, @@ -902,3 +1558,42 @@ impl ApplySnapshotObserver for TiFlashObserver { true } } + +impl RoleObserver for TiFlashObserver { + fn on_role_change(&self, ctx: &mut ObserverContext<'_>, r: &RoleChange) { + let region_id = ctx.region().get_id(); + let is_replicated = !r.initialized; + let f = |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + // Note the region info may be registered by maybe_fast_path + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, 0; + "region_id" => region_id, + "is_replicated" => is_replicated, + ); + if is_replicated { + o.get_mut() + .replicated_or_created + .store(true, Ordering::SeqCst); + } + } + MapEntry::Vacant(v) => { + // TODO support peer_id + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, 0; + "region_id" => region_id, + "is_replicated" => is_replicated, + ); + if is_replicated { + let c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); + } + } + }; + // TODO remove unwrap + self.get_cached_manager() + .access_cached_region_info_mut(region_id, f) + .unwrap(); + } +} diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs new file mode 100644 index 00000000000..8efc061b35f --- /dev/null +++ b/engine_store_ffi/src/ps_engine.rs @@ -0,0 +1,612 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +// Disable warnings for unused engine_rocks's feature. +#![allow(dead_code)] +#![allow(unused_variables)] + +use std::{ + fmt::{Debug, Formatter}, + slice, +}; + +use engine_traits::{ + Error, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, + RaftEngineReadOnly, RaftLogBatch, Result, +}; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, +}; +use protobuf::Message; +use raft::eraftpb::Entry; +use tikv_util::{box_try, info}; +use tracker::TrackerToken; + +use crate::{gen_engine_store_server_helper, PageAndCppStrWithView, RawCppPtr}; + +// 1. STORE_IDENT 0 +// 2. PREPARE_BOOTSTRAP 1 +// 3. RaftLocalState 2 +// 4. RegionLocalState 3 +// 5. RaftApplyState 4 +// 6. Snapshot RaftLocalState 5 +// 7. Reserved 6..9 +// 8. 
Log 10(+ offset 5) + +// pub const PS_KEY_PREFIX: &[u8] = &[b'r', b'_']; +// pub const PS_KEY_SEP: u8 = b'_'; +// +// const RAFT_LOCAL_STATE_ID : u64 = 2; +// const RAFT_LOG_ID_OFFSET : u64 = 5; +// +// pub fn ps_raft_state_key(region_id: u64) -> [u8; 19] { +// let mut key = [0; 19]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// BigEndian::write_u64(&mut key[11..19], RAFT_LOCAL_STATE_ID); +// key +// } +// +// pub fn ps_raft_log_key(region_id: u64, log_index: u64) -> [u8; 19] { +// let mut key = [0; 19]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// BigEndian::write_u64(&mut key[11..19], log_index + RAFT_LOG_ID_OFFSET); +// key +// } +// +// pub fn ps_raft_log_prefix(region_id: u64) -> [u8; 11] { +// let mut key = [0; 11]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// key +// } +// +// pub fn ps_raft_log_index(key: &[u8]) -> u64 { +// let expect_key_len = PS_KEY_PREFIX.len() +// + mem::size_of::() +// + mem::size_of::() +// + mem::size_of::(); +// if key.len() != expect_key_len { +// panic!("wrong key format {:?}", key); +// } +// BigEndian::read_u64( +// &key[expect_key_len - mem::size_of::()..], +// ) +// } + +pub struct PSEngineWriteBatch { + pub engine_store_server_helper: isize, + pub raw_write_batch: RawCppPtr, +} + +impl PSEngineWriteBatch { + pub fn new(engine_store_server_helper: isize) -> PSEngineWriteBatch { + let helper = gen_engine_store_server_helper(engine_store_server_helper); + let raw_write_batch = helper.create_write_batch(); + PSEngineWriteBatch { + engine_store_server_helper, + raw_write_batch, + } + } + + fn put_page(&mut self, page_id: &[u8], value: &[u8]) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_put_page(self.raw_write_batch.ptr, page_id.into(), value.into()); + Ok(()) + } + + fn del_page(&mut self, page_id: &[u8]) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_del_page(self.raw_write_batch.ptr, page_id.into()); + Ok(()) + } + + fn append_impl( + &mut self, + raft_group_id: u64, + entries: &[Entry], + mut ser_buf: Vec, + ) -> Result<()> { + for entry in entries { + ser_buf.clear(); + entry.write_to_vec(&mut ser_buf).unwrap(); + let key = keys::raft_log_key(raft_group_id, entry.get_index()); + self.put_page(&key, &ser_buf)?; + } + Ok(()) + } + + fn put_msg(&mut self, page_id: &[u8], m: &M) -> Result<()> { + self.put_page(page_id, &m.write_to_bytes()?) 
+ } + + fn data_size(&self) -> usize { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_size(self.raw_write_batch.ptr) as usize + } + + fn clear(&self) { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_clear(self.raw_write_batch.ptr); + } +} + +impl RaftLogBatch for PSEngineWriteBatch { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + // TODO + panic!("PSEngineWriteBatch has no delete method !!!!!"); + // for index in last.get_index() + 1..overwrite_to { + // let key = keys::raft_log_key(raft_group_id, index); + // self.delete(&key).unwrap(); + // } + } + if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { + let ser_buf = Vec::with_capacity(max_size as usize); + return self.append_impl(raft_group_id, &entries, ser_buf); + } + Ok(()) + } + + fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { + self.put_msg(&keys::raft_state_key(raft_group_id), state) + } + + fn persist_size(&self) -> usize { + self.data_size() + } + + fn is_empty(&self) -> bool { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_is_empty(self.raw_write_batch.ptr) != 0 + } + + fn merge(&mut self, src: Self) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_merge(self.raw_write_batch.ptr, src.raw_write_batch.ptr); + Ok(()) + } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.put_msg(keys::STORE_IDENT_KEY, ident) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.del_page(keys::PREPARE_BOOTSTRAP_KEY) + } + + fn put_region_state( + &mut self, + raft_group_id: u64, + _apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + self.put_msg(&keys::region_state_key(raft_group_id), state) + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + _apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + self.put_msg(&keys::apply_state_key(raft_group_id), state) + } + + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) + } +} + +#[derive(Clone, Default)] +pub struct PSEngine { + pub engine_store_server_helper: isize, +} + +impl std::fmt::Debug for PSEngine { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PSEngine") + .field( + "engine_store_server_helper", + &self.engine_store_server_helper, + ) + .finish() + } +} + +impl PSEngine { + pub fn new() -> Self { + PSEngine { + engine_store_server_helper: 0, + } + } + + pub fn init(&mut self, engine_store_server_helper: isize) { + self.engine_store_server_helper = engine_store_server_helper; + } + + fn get_msg_cf(&self, page_id: &[u8]) -> Result> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let value 
= helper.read_page(page_id.into()); + if value.view.len == 0 { + return Ok(None); + } + + let mut m = M::default(); + m.merge_from_bytes(unsafe { + slice::from_raw_parts(value.view.data as *const u8, value.view.len as usize) + })?; + Ok(Some(m)) + } + + fn get_value(&self, page_id: &[u8]) -> Option> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let value = helper.read_page(page_id.into()); + return if value.view.len == 0 { + None + } else { + Some(value.view.to_slice().to_vec()) + }; + } + + // Seek the first key >= given key, if not found, return None. + fn seek(&self, key: &[u8]) -> Option> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let target_key = helper.seek_ps_key(key.into()); + if target_key.view.len == 0 { + None + } else { + Some(target_key.view.to_slice().to_vec()) + } + } + + /// scan the key between start_key(inclusive) and end_key(exclusive), + /// the upper bound is omitted if end_key is empty + fn scan(&self, start_key: &[u8], end_key: &[u8], mut f: F) -> Result<()> + where + F: FnMut(&[u8], &[u8]) -> Result, + { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let values = helper.scan_page(start_key.into(), end_key.into()); + let arr = values.inner as *mut PageAndCppStrWithView; + for i in 0..values.len { + let value = unsafe { &*arr.offset(i as isize) }; + if value.page_view.len != 0 + && !f(value.key_view.to_slice(), value.page_view.to_slice())? + { + break; + } + } + Ok(()) + } + + fn gc_impl( + &self, + raft_group_id: u64, + mut from: u64, + to: u64, + raft_wb: &mut PSEngineWriteBatch, + ) -> Result { + if from == 0 { + let start_key = keys::raft_log_key(raft_group_id, 0); + let prefix = keys::raft_log_prefix(raft_group_id); + // TODO: make sure the seek can skip other raft related key and to the first log + // key + match self.seek(&start_key) { + Some(target_key) if target_key.starts_with(&prefix) => { + from = box_try!(keys::raft_log_index(&target_key)) + } + // No need to gc. 
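+                // Covers both cases: seek found nothing, or the first key returned is not a
+                // raft log key of this region.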
+ _ => return Ok(0), + } + } + if from >= to { + return Ok(0); + } + + let mut raft_wb = self.log_batch(0); + for idx in from..to { + raft_wb.del_page(&keys::raft_log_key(raft_group_id, idx))?; + } + // TODO: keep the max size of raft_wb under some threshold + self.consume(&mut raft_wb, false)?; + Ok((to - from) as usize) + } + + fn is_empty(&self) -> bool { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.is_ps_empty() != 0 + } +} + +impl RaftEngineReadOnly for PSEngine { + fn get_raft_state(&self, raft_group_id: u64) -> Result> { + let key = keys::raft_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_entry(&self, raft_group_id: u64, index: u64) -> Result> { + let key = keys::raft_log_key(raft_group_id, index); + self.get_msg_cf(&key) + } + + fn fetch_entries_to( + &self, + region_id: u64, + low: u64, + high: u64, + max_size: Option, + buf: &mut Vec, + ) -> Result { + let (max_size, mut total_size, mut count) = (max_size.unwrap_or(usize::MAX), 0, 0); + + let start_key = keys::raft_log_key(region_id, low); + let end_key = keys::raft_log_key(region_id, high); + + self.scan(&start_key, &end_key, |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + total_size += page.len(); + count += 1; + Ok(total_size < max_size) + })?; + + Ok(count) + } + + fn get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { + let start_key = keys::raft_log_key(region_id, 0); + let end_key = keys::raft_log_key(region_id, u64::MAX); + self.scan(&start_key, &end_key, |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + Ok(true) + })?; + Ok(()) + } + + fn is_empty(&self) -> Result { + Ok(self.is_empty()) + } + + fn get_store_ident(&self) -> Result> { + self.get_msg_cf(keys::STORE_IDENT_KEY) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.get_msg_cf(keys::PREPARE_BOOTSTRAP_KEY) + } + + fn get_region_state( + &self, + _apply_index: u64, + raft_group_id: u64, + ) -> Result> { + let key = keys::region_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_apply_state( + &self, + _apply_index: u64, + raft_group_id: u64, + ) -> Result> { + let key = keys::apply_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_recover_state(&self) -> Result> { + self.get_msg_cf(keys::RECOVER_STATE_KEY) + } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() + } + + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } +} + +impl RaftEngineDebug for PSEngine { + fn scan_entries(&self, raft_group_id: u64, mut f: F) -> Result<()> + where + F: FnMut(&Entry) -> Result, + { + let start_key = keys::raft_log_key(raft_group_id, 0); + let end_key = keys::raft_log_key(raft_group_id, u64::MAX); + self.scan(&start_key, &end_key, |_, value| { + let mut entry = Entry::default(); + entry.merge_from_bytes(value)?; + f(&entry) + })?; + Ok(()) + } +} + +impl RaftEngine for PSEngine { + type LogBatch = PSEngineWriteBatch; + + fn log_batch(&self, capacity: usize) -> Self::LogBatch { + PSEngineWriteBatch::new(self.engine_store_server_helper) + } + + fn sync(&self) -> Result<()> { + Ok(()) + } + + fn consume(&self, batch: &mut Self::LogBatch, sync_log: bool) -> Result { + let bytes = batch.data_size(); + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.consume_write_batch(batch.raw_write_batch.ptr); + batch.clear(); + Ok(bytes) + } + + fn 
consume_and_shrink( + &self, + batch: &mut Self::LogBatch, + sync_log: bool, + max_capacity: usize, + shrink_to: usize, + ) -> Result { + self.consume(batch, sync_log) + } + + fn clean( + &self, + raft_group_id: u64, + mut first_index: u64, + state: &RaftLocalState, + batch: &mut Self::LogBatch, + ) -> Result<()> { + // info!("try clean raft_group_id {} from {} to {}", raft_group_id, first_index, + // state.last_index); + batch.del_page(&keys::raft_state_key(raft_group_id))?; + batch.del_page(&keys::region_state_key(raft_group_id))?; + batch.del_page(&keys::apply_state_key(raft_group_id))?; + if first_index == 0 { + let start_key = keys::raft_log_key(raft_group_id, 0); + let prefix = keys::raft_log_prefix(raft_group_id); + // TODO: make sure the seek can skip other raft related key and to the first log + // key + match self.seek(&start_key) { + Some(target_key) if target_key.starts_with(&prefix) => { + first_index = box_try!(keys::raft_log_index(&target_key)) + } + // No need to gc. + _ => return Ok(()), + } + } + if first_index >= state.last_index { + return Ok(()); + } + info!( + "clean raft_group_id {} from {} to {}", + raft_group_id, first_index, state.last_index + ); + // TODO: find the first raft log index of this raft group + if first_index <= state.last_index { + for index in first_index..=state.last_index { + batch.del_page(&keys::raft_log_key(raft_group_id, index))?; + } + } + self.consume(batch, true)?; + Ok(()) + } + + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; + Ok(()) + } + + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() + } + + fn flush_metrics(&self, instance: &str) {} + + fn dump_stats(&self) -> Result { + Ok(String::from("")) + } + + fn get_engine_path(&self) -> &str { + "" + } + + fn get_engine_size(&self) -> Result { + Ok(0) + } + + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut err = None; + self.scan(start_key, end_key, |key, _| { + let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + + match f(region_id) { + Ok(()) => Ok(true), + Err(e) => { + err = Some(e); + Ok(false) + } + } + })?; + match err { + None => Ok(()), + Some(e) => Err(e), + } + } +} + +impl PerfContextExt for PSEngine { + type PerfContext = PSPerfContext; + + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + PSPerfContext::new(level, kind) + } +} + +#[derive(Debug)] +pub struct PSPerfContext {} + +impl PSPerfContext { + pub fn new(level: PerfLevel, kind: PerfContextKind) -> Self { + PSPerfContext {} + } +} + +impl PerfContext for PSPerfContext { + fn start_observe(&mut self) {} + + fn report_metrics(&mut self, trackers: &[TrackerToken]) {} +} diff --git a/engine_tiflash/Cargo.toml b/engine_tiflash/Cargo.toml index 09e55f8d60c..6969c39fc1a 100644 --- a/engine_tiflash/Cargo.toml +++ b/engine_tiflash/Cargo.toml @@ -10,6 +10,7 @@ portable = ["rocksdb/portable"] sse = ["rocksdb/sse"] failpoints = ["fail/failpoints"] testexport = [] +enable-pagestorage = [] # Disables runtime checks of invariants required by RocksDB that are redundant # with assertions inside RocksDB itself. 
This makes it possible to test those @@ -41,6 +42,7 @@ libc = "0.2" log_wrappers = { workspace = true } num_cpus = "1" online_config = { workspace = true } +portable-atomic = "0.3" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = "2" diff --git a/engine_tiflash/src/cached_region_info_manager.rs b/engine_tiflash/src/cached_region_info_manager.rs new file mode 100644 index 00000000000..6484764b2ab --- /dev/null +++ b/engine_tiflash/src/cached_region_info_manager.rs @@ -0,0 +1,156 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::hash_map::Entry as MapEntry, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, RwLock, + }, +}; + +use collections::HashMap; +use tikv_util::{error, info}; + +const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; + +pub type Result = std::result::Result>; + +#[derive(Debug, Default)] +pub struct CachedRegionInfo { + pub replicated_or_created: AtomicBool, + // TiKV assumes a region's learner peer is added through snapshot. + // If this field is false, will try fast path when meet MsgAppend. + // If this field is true, it means this peer is inited or will be inited by a TiKV snapshot. + // NOTE If we want a fallback, then we must set inited_or_fallback to true, + // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. + pub inited_or_fallback: AtomicBool, + pub snapshot_inflight: portable_atomic::AtomicU128, + pub fast_add_peer_start: portable_atomic::AtomicU128, +} + +pub type CachedRegionInfoMap = HashMap>; + +pub struct CachedRegionInfoManager { + pub cached_region_info: Arc>>, +} + +impl CachedRegionInfoManager { + // Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) + #[inline] + fn hash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); + i ^ (i >> 31) + } + + #[allow(dead_code)] + #[inline] + fn unhash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); + i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); + i ^ (i >> 30) ^ (i >> 60) + } + + pub fn new() -> Self { + let mut cached_region_info = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); + for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { + cached_region_info.push(RwLock::new(HashMap::default())); + } + Self { + cached_region_info: Arc::new(cached_region_info), + } + } + + #[inline] + fn slot_index(id: u64) -> usize { + debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); + Self::hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) + } + + pub fn access_cached_region_info_mut>)>( + &self, + region_id: u64, + mut f: F, + ) -> Result<()> { + let slot_id = Self::slot_index(region_id); + let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { + Ok(g) => g, + Err(_) => return Err("access_cached_region_info_mut poisoned".into()), + }; + f(guard.entry(region_id)); + Ok(()) + } + + pub fn access_cached_region_info)>( + &self, + region_id: u64, + mut f: F, + ) { + let slot_id = Self::slot_index(region_id); + let guard = match self.cached_region_info.get(slot_id).unwrap().read() { + Ok(g) => g, + Err(_) => panic!("access_cached_region_info poisoned!"), + }; + match guard.get(®ion_id) { + Some(g) => f(g.clone()), + None => (), + } + } + + pub fn get_inited_or_fallback(&self, region_id: u64) -> Option { + let mut result: Option = None; + let f = |info: Arc| { + result = 
Some(info.inited_or_fallback.load(Ordering::SeqCst)); + }; + self.access_cached_region_info(region_id, f); + result + } + + pub fn remove_cached_region_info(&self, region_id: u64) { + let slot_id = Self::slot_index(region_id); + if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { + info!( + "remove_cached_region_info"; + "region_id" => region_id, + ); + let _ = g.remove(®ion_id); + } + } + + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> Result<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + pub fn set_snapshot_inflight(&self, region_id: u64, v: u128) -> Result<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().snapshot_inflight.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + pub fn fallback_to_slow_path(&self, region_id: u64) { + // TODO clean local, and prepare to request snapshot from TiKV as a trivial + // procedure. + fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); + if self.set_inited_or_fallback(region_id, true).is_err() { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } +} diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index 8b53f342fc1..44dc6566353 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -3,19 +3,20 @@ #![allow(dead_code)] #![allow(unused_variables)] use std::{ - fmt::Formatter, + fmt::{self, Debug, Formatter}, fs, + ops::Deref, path::Path, sync::{ - atomic::{AtomicUsize, Ordering}, + atomic::{AtomicIsize, Ordering}, Arc, }, }; use engine_rocks::{RocksDbVector, RocksEngineIterator, RocksSnapshot}; use engine_traits::{ - Checkpointable, Checkpointer, Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, - Result, SyncMutable, + Checkpointable, Checkpointer, DbVector, Error, IterOptions, Iterable, KvEngine, Peekable, + ReadOptions, Result, SyncMutable, }; use rocksdb::{Writable, DB}; @@ -27,8 +28,48 @@ pub struct FsStatsExt { pub available: u64, } +pub type RawPSWriteBatchPtr = *mut ::std::os::raw::c_void; +pub type RawPSWriteBatchWrapperTag = u32; + +// This is just a copy from engine_store_ffi::RawCppPtr +#[repr(C)] +#[derive(Debug)] +pub struct RawPSWriteBatchWrapper { + pub ptr: RawPSWriteBatchPtr, + pub type_: RawPSWriteBatchWrapperTag, +} + +unsafe impl Send for RawPSWriteBatchWrapper {} + pub trait FFIHubInner { fn get_store_stats(&self) -> FsStatsExt; + + fn create_write_batch(&self) -> RawPSWriteBatchWrapper; + + fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper); + + fn consume_write_batch(&self, wb: RawPSWriteBatchPtr); + + fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize; + + fn write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool; + + fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr); + + fn write_batch_clear(&self, wb: RawPSWriteBatchPtr); + + fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]); + + fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]); + + fn read_page(&self, page_id: &[u8]) -> Option>; + + fn scan_page( + &self, + start_page_id: &[u8], + end_page_id: &[u8], + f: &mut dyn FnMut(&[u8], &[u8]) -> Result, + ); } pub trait 
FFIHub: FFIHubInner + Send + Sync {} @@ -40,8 +81,10 @@ pub struct RocksEngine { pub rocks: engine_rocks::RocksEngine, pub engine_store_server_helper: isize, pub pool_capacity: usize, - pub pending_applies_count: Arc, + pub pending_applies_count: Arc, pub ffi_hub: Option>, + pub config_set: Option>, + pub cached_region_info_manager: Option>, } impl std::fmt::Debug for RocksEngine { @@ -66,11 +109,18 @@ impl RocksEngine { engine_store_server_helper: isize, snap_handle_pool_size: usize, ffi_hub: Option>, + config_set: Option>, ) { + #[cfg(feature = "enable-pagestorage")] + tikv_util::info!("enabled pagestorage"); + #[cfg(not(feature = "enable-pagestorage"))] + tikv_util::info!("disabled pagestorage"); self.engine_store_server_helper = engine_store_server_helper; self.pool_capacity = snap_handle_pool_size; self.pending_applies_count.store(0, Ordering::SeqCst); self.ffi_hub = ffi_hub; + self.config_set = config_set; + self.cached_region_info_manager = Some(Arc::new(crate::CachedRegionInfoManager::new())) } pub fn from_rocks(rocks: engine_rocks::RocksEngine) -> Self { @@ -78,8 +128,10 @@ impl RocksEngine { rocks, engine_store_server_helper: 0, pool_capacity: 0, - pending_applies_count: Arc::new(AtomicUsize::new(0)), + pending_applies_count: Arc::new(AtomicIsize::new(0)), ffi_hub: None, + config_set: None, + cached_region_info_manager: None, } } @@ -88,8 +140,10 @@ impl RocksEngine { rocks: engine_rocks::RocksEngine::from_db(db), engine_store_server_helper: 0, pool_capacity: 0, - pending_applies_count: Arc::new(AtomicUsize::new(0)), + pending_applies_count: Arc::new(AtomicIsize::new(0)), ffi_hub: None, + config_set: None, + cached_region_info_manager: None, } } @@ -147,19 +201,44 @@ impl KvEngine for RocksEngine { // new task, or when `handle_pending_applies` need to handle multiple // snapshots. We need to compare to what's in queue. - fn can_apply_snapshot(&self, is_timeout: bool, new_batch: bool, _region_id: u64) -> bool { + fn can_apply_snapshot(&self, is_timeout: bool, new_batch: bool, region_id: u64) -> bool { + fail::fail_point!("on_can_apply_snapshot", |e| e + .unwrap() + .parse::() + .unwrap()); + if let Some(s) = self.config_set.as_ref() { + if s.engine_store.enable_fast_add_peer { + // TODO Return true if this is an empty snapshot. + // We need to test if the region is still in fast add peer mode. + let result = self + .cached_region_info_manager + .as_ref() + .expect("expect cached_region_info_manager") + .get_inited_or_fallback(region_id); + match result { + Some(true) => { + // Do nothing. + tikv_util::debug!("can_apply_snapshot no fast path. do normal checking"; + "region_id" => region_id, + ); + } + None | Some(false) => { + // Otherwise, try fast path. + return true; + } + }; + } + } // is called after calling observer's pre_handle_snapshot let in_queue = self.pending_applies_count.load(Ordering::SeqCst); - // if queue is full, we should begin to handle let can = if is_timeout && new_batch { + // If queue is full, we should begin to handle true } else { - in_queue > self.pool_capacity + // Otherwise, we wait until the queue is full. + // In order to batch more tasks. 
+ in_queue > (self.pool_capacity as isize) }; - fail::fail_point!("on_can_apply_snapshot", |e| e - .unwrap() - .parse::() - .unwrap()); can } } @@ -167,18 +246,71 @@ impl KvEngine for RocksEngine { impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; + #[cfg(feature = "enable-pagestorage")] + fn scan( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> Result<()> + where + F: FnMut(&[u8], &[u8]) -> Result, + { + let mut f = f; + self.ffi_hub + .as_ref() + .unwrap() + .scan_page(start_key.into(), end_key.into(), &mut f); + Ok(()) + } + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { self.rocks.iterator_opt(cf, opts) } } +pub struct PsDbVector(Vec); + +impl PsDbVector { + pub fn from_raw(raw: Vec) -> PsDbVector { + PsDbVector(raw) + } +} + +impl DbVector for PsDbVector {} + +impl Deref for PsDbVector { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &self.0 + } +} + +impl Debug for PsDbVector { + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + write!(formatter, "{:?}", &**self) + } +} + +impl<'a> PartialEq<&'a [u8]> for PsDbVector { + fn eq(&self, rhs: &&[u8]) -> bool { + **rhs == **self + } +} + impl Peekable for RocksEngine { + #[cfg(not(feature = "enable-pagestorage"))] type DbVector = RocksDbVector; + #[cfg(not(feature = "enable-pagestorage"))] fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { self.rocks.get_value_opt(opts, key) } + #[cfg(not(feature = "enable-pagestorage"))] fn get_value_cf_opt( &self, opts: &ReadOptions, @@ -187,6 +319,28 @@ impl Peekable for RocksEngine { ) -> Result> { self.rocks.get_value_cf_opt(opts, cf, key) } + + #[cfg(feature = "enable-pagestorage")] + type DbVector = PsDbVector; + + #[cfg(feature = "enable-pagestorage")] + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + let result = self.ffi_hub.as_ref().unwrap().read_page(key); + return match result { + None => Ok(None), + Some(v) => Ok(Some(PsDbVector::from_raw(v))), + }; + } + + #[cfg(feature = "enable-pagestorage")] + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + self.get_value_opt(opts, key) + } } impl RocksEngine { @@ -196,6 +350,7 @@ impl RocksEngine { } impl SyncMutable for RocksEngine { + #[cfg(not(feature = "enable-pagestorage"))] fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().put(key, value).map_err(r2e); @@ -203,6 +358,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(not(feature = "enable-pagestorage"))] fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -216,6 +372,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(not(feature = "enable-pagestorage"))] fn delete(&self, key: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().delete(key).map_err(r2e); @@ -223,6 +380,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(not(feature = "enable-pagestorage"))] fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -232,6 +390,70 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(feature = "enable-pagestorage")] + fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + if self.do_write(engine_traits::CF_DEFAULT, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + 
.as_ref() + .unwrap() + .write_batch_put_page(ps_wb.ptr, key, value); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(feature = "enable-pagestorage")] + fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if self.do_write(cf, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(ps_wb.ptr, key, value); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(feature = "enable-pagestorage")] + fn delete(&self, key: &[u8]) -> Result<()> { + if self.do_write(engine_traits::CF_DEFAULT, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(ps_wb.ptr, key); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(feature = "enable-pagestorage")] + fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { + if self.do_write(cf, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(ps_wb.ptr, key); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { // do nothing Ok(()) diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 4ff0baf2f8d..882ae194f68 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -15,9 +15,7 @@ //! //! Please read the engine_trait crate docs before hacking. #![allow(dead_code)] -#![feature(backtrace)] #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #![feature(let_chains)] #![feature(option_get_or_insert_default)] @@ -56,8 +54,17 @@ mod status; pub use crate::status::*; mod table_properties; pub use crate::table_properties::*; + +#[cfg(not(feature = "enable-pagestorage"))] mod write_batch; +#[cfg(not(feature = "enable-pagestorage"))] pub use crate::write_batch::*; + +#[cfg(feature = "enable-pagestorage")] +mod ps_write_batch; +#[cfg(feature = "enable-pagestorage")] +pub use crate::ps_write_batch::*; + pub mod mvcc_properties; pub use crate::mvcc_properties::*; pub mod perf_context; @@ -118,6 +125,8 @@ pub mod raw; mod proxy_utils; pub use proxy_utils::*; +mod cached_region_info_manager; +pub use cached_region_info_manager::*; pub use rocksdb::DB; pub fn get_env( diff --git a/engine_tiflash/src/misc.rs b/engine_tiflash/src/misc.rs index 0393a96bd02..70d94e567ae 100644 --- a/engine_tiflash/src/misc.rs +++ b/engine_tiflash/src/misc.rs @@ -315,6 +315,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/engine_tiflash/src/perf_context.rs b/engine_tiflash/src/perf_context.rs index a731a9461dc..f8cfdbcc667 100644 --- a/engine_tiflash/src/perf_context.rs +++ b/engine_tiflash/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: 
PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } diff --git a/engine_tiflash/src/proxy_utils.rs b/engine_tiflash/src/proxy_utils.rs index c44e355ae59..be7eb7cc2e9 100644 --- a/engine_tiflash/src/proxy_utils.rs +++ b/engine_tiflash/src/proxy_utils.rs @@ -71,3 +71,26 @@ pub fn log_check_double_write(batch: &crate::RocksWriteBatchVec) -> bool { } false } + +use serde_derive::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct EngineStoreConfig { + pub enable_fast_add_peer: bool, +} + +#[allow(clippy::derivable_impls)] +impl Default for EngineStoreConfig { + fn default() -> Self { + Self { + enable_fast_add_peer: false, + } + } +} + +#[derive(Default, Debug)] +pub struct ProxyConfigSet { + pub engine_store: EngineStoreConfig, +} diff --git a/engine_tiflash/src/ps_write_batch.rs b/engine_tiflash/src/ps_write_batch.rs new file mode 100644 index 00000000000..b7e895fb29b --- /dev/null +++ b/engine_tiflash/src/ps_write_batch.rs @@ -0,0 +1,365 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; +use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; + +use crate::{engine::RocksEngine, r2e, FFIHubInner, RawPSWriteBatchWrapper}; + +const WRITE_BATCH_MAX_BATCH: usize = 16; +const WRITE_BATCH_LIMIT: usize = 16; + +impl WriteBatchExt for RocksEngine { + type WriteBatch = RocksWriteBatchVec; + + const WRITE_BATCH_MAX_KEYS: usize = 256; + + fn write_batch(&self) -> RocksWriteBatchVec { + RocksWriteBatchVec::new( + Arc::clone(self.as_inner()), + self.ffi_hub.clone(), + self.ffi_hub.as_ref().unwrap().create_write_batch(), + WRITE_BATCH_LIMIT, + 1, + self.support_multi_batch_write(), + ) + } + + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity( + self, + self.ffi_hub.as_ref().unwrap().create_write_batch(), + cap, + ) + } +} + +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. 
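+/// NOTE: in this PageStorage-backed variant, `put`/`delete` mutations are routed into the FFI PageStorage write batch (`ps_wb`) via `ffi_hub`, while the inner RocksDB batches (`wbs`) are kept mainly for save-point bookkeeping.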
+pub struct RocksWriteBatchVec { + pub db: Arc, + pub wbs: Vec, + pub ffi_hub: Option>, + pub ps_wb: RawPSWriteBatchWrapper, + save_points: Vec, + index: usize, + batch_size_limit: usize, + support_write_batch_vec: bool, +} + +impl Drop for RocksWriteBatchVec { + fn drop(&mut self) { + if !self.ps_wb.ptr.is_null() { + self.ffi_hub + .as_ref() + .unwrap() + .destroy_write_batch(&self.ps_wb); + } + self.ps_wb.ptr = std::ptr::null_mut(); + } +} + +impl RocksWriteBatchVec { + pub fn new( + db: Arc, + ffi_hub: Option>, + ps_wb: RawPSWriteBatchWrapper, + batch_size_limit: usize, + cap: usize, + support_write_batch_vec: bool, + ) -> RocksWriteBatchVec { + let wb = RawWriteBatch::with_capacity(cap); + RocksWriteBatchVec { + db, + wbs: vec![wb], + ffi_hub, + ps_wb, + save_points: vec![], + index: 0, + batch_size_limit, + support_write_batch_vec, + } + } + + pub fn with_unit_capacity( + engine: &RocksEngine, + ps_wb: RawPSWriteBatchWrapper, + cap: usize, + ) -> RocksWriteBatchVec { + Self::new( + engine.as_inner().clone(), + engine.ffi_hub.clone(), + ps_wb, + WRITE_BATCH_LIMIT, + cap, + engine.support_multi_batch_write(), + ) + } + + pub fn as_inner(&self) -> &[RawWriteBatch] { + &self.wbs[0..=self.index] + } + + pub fn get_db(&self) -> &DB { + self.db.as_ref() + } + + /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. + #[inline(always)] + fn check_switch_batch(&mut self) { + if self.support_write_batch_vec + && self.batch_size_limit > 0 + && self.wbs[self.index].count() >= self.batch_size_limit + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(RawWriteBatch::default()); + } + } + } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + // write into ps + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(self.ps_wb.ptr); + Ok(self + .ffi_hub + .as_ref() + .unwrap() + .write_batch_size(self.ps_wb.ptr) as u64) + } + + fn data_size(&self) -> usize { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_size(self.ps_wb.ptr) + } + + fn count(&self) -> usize { + // FIXME + 0 + } + + fn is_empty(&self) -> bool { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_is_empty(self.ps_wb.ptr) + } + + fn should_write_to_engine(&self) -> bool { + // Disable TiKV's logic, and using Proxy's instead. 
+ false + } + + fn clear(&mut self) { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_clear(self.ps_wb.ptr); + } + + fn set_save_point(&mut self) { + self.wbs[self.index].set_save_point(); + self.save_points.push(self.index); + } + + fn pop_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(r2e); + } + Err(r2e("no save point")) + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(r2e); + } + Err(r2e("no save point")) + } + + fn merge(&mut self, other: Self) -> Result<()> { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_merge(self.ps_wb.ptr, other.ps_wb.ptr); + Ok(()) + } +} + +impl RocksWriteBatchVec { + fn do_write(&self, cf: &str, key: &[u8]) -> bool { + crate::do_write(cf, key) + } +} + +impl Mutable for RocksWriteBatchVec { + fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + if !self.do_write(engine_traits::CF_DEFAULT, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(self.ps_wb.ptr, key, value); + Ok(()) + } + + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if !self.do_write(cf, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(self.ps_wb.ptr, key, value); + Ok(()) + } + + fn delete(&mut self, key: &[u8]) -> Result<()> { + if !self.do_write(engine_traits::CF_DEFAULT, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(self.ps_wb.ptr, key); + Ok(()) + } + + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + if !self.do_write(cf, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(self.ps_wb.ptr, key); + Ok(()) + } + + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + Ok(()) + } + + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Peekable, WriteBatch, CF_DEFAULT}; + use rocksdb::DBOptions as RawDBOptions; + use tempfile::Builder; + + use super::{ + super::{util::new_engine_opt, RocksDbOptions}, + *, + }; + use crate::RocksCfOptions; + + #[test] + fn test_should_write_to_engine_with_pipeline_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(true); + opt.enable_multi_batch_write(false); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert!( + !engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = engine.write_batch(); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + + let v = engine.get_value(b"aaa").unwrap(); + + assert!(v.is_some()); + assert_eq!(v.unwrap(), b"bbb"); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + 
wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(false); + opt.enable_multi_batch_write(true); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert!( + engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = engine.write_batch(); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..WRITE_BATCH_MAX_BATCH * WRITE_BATCH_LIMIT { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } +} diff --git a/engine_tiflash/src/raft_engine.rs b/engine_tiflash/src/raft_engine.rs index d5331a2ce29..d4a8912b9bf 100644 --- a/engine_tiflash/src/raft_engine.rs +++ b/engine_tiflash/src/raft_engine.rs @@ -166,6 +166,10 @@ impl RaftEngineReadOnly for RocksEngine { panic!() } + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) } @@ -232,7 +236,11 @@ impl RaftEngine for RocksEngine { type LogBatch = RocksWriteBatchVec; fn log_batch(&self, capacity: usize) -> Self::LogBatch { - RocksWriteBatchVec::with_unit_capacity(self, capacity) + RocksWriteBatchVec::with_unit_capacity( + self, + self.ffi_hub.as_ref().unwrap().create_write_batch(), + capacity, + ) } fn sync(&self) -> Result<()> { @@ -361,7 +369,19 @@ impl RaftEngine for RocksEngine { } impl RaftLogBatch for RocksWriteBatchVec { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -369,13 +389,6 @@ impl RaftLogBatch for RocksWriteBatchVec { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } @@ -434,6 +447,15 @@ impl RaftLogBatch for RocksWriteBatchVec { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> 
Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } diff --git a/engine_tiflash/src/snapshot.rs b/engine_tiflash/src/snapshot.rs index b19a32fd739..60a12c4ac6d 100644 --- a/engine_tiflash/src/snapshot.rs +++ b/engine_tiflash/src/snapshot.rs @@ -5,7 +5,9 @@ use std::{ sync::Arc, }; -use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ @@ -95,3 +97,9 @@ impl Peekable for RocksSnapshot { Ok(v.map(RocksDbVector::from_raw)) } } + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() + } +} diff --git a/engine_tiflash/src/write_batch.rs b/engine_tiflash/src/write_batch.rs index fef608e328e..42e218a53b3 100644 --- a/engine_tiflash/src/write_batch.rs +++ b/engine_tiflash/src/write_batch.rs @@ -5,7 +5,10 @@ use std::sync::Arc; use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; -use crate::{engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle}; +use crate::{ + engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle, FFIHubInner, + RawPSWriteBatchWrapper, +}; const WRITE_BATCH_MAX_BATCH: usize = 16; const WRITE_BATCH_LIMIT: usize = 16; @@ -18,6 +21,8 @@ impl WriteBatchExt for RocksEngine { fn write_batch(&self) -> RocksWriteBatchVec { RocksWriteBatchVec::new( Arc::clone(self.as_inner()), + self.ffi_hub.clone(), + self.ffi_hub.as_ref().unwrap().create_write_batch(), WRITE_BATCH_LIMIT, 1, self.support_multi_batch_write(), @@ -25,7 +30,11 @@ impl WriteBatchExt for RocksEngine { } fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { - RocksWriteBatchVec::with_unit_capacity(self, cap) + RocksWriteBatchVec::with_unit_capacity( + self, + self.ffi_hub.as_ref().unwrap().create_write_batch(), + cap, + ) } } @@ -49,6 +58,8 @@ pub struct RocksWriteBatchVec { impl RocksWriteBatchVec { pub fn new( db: Arc, + _ffi_hub: Option>, + _ps_wb: RawPSWriteBatchWrapper, batch_size_limit: usize, cap: usize, support_write_batch_vec: bool, @@ -64,9 +75,15 @@ impl RocksWriteBatchVec { } } - pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + pub fn with_unit_capacity( + engine: &RocksEngine, + ps_wb: RawPSWriteBatchWrapper, + cap: usize, + ) -> RocksWriteBatchVec { Self::new( engine.as_inner().clone(), + engine.ffi_hub.clone(), + ps_wb, WRITE_BATCH_LIMIT, cap, engine.support_multi_batch_write(), @@ -292,7 +309,14 @@ mod tests { assert!(v.is_some()); assert_eq!(v.unwrap(), b"bbb"); - let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity( + &engine, + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, + 1024, + ); for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); } @@ -332,7 +356,14 @@ mod tests { assert!(!wb.should_write_to_engine()); wb.put(b"aaa", b"bbb").unwrap(); assert!(wb.should_write_to_engine()); - let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity( + &engine, + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, + 1024, + ); for _i in 0..WRITE_BATCH_MAX_BATCH * WRITE_BATCH_LIMIT { wb.put(b"aaa", b"bbb").unwrap(); } diff --git a/etc/config-template.toml b/etc/config-template.toml index 
62623afed0e..59152570da1 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -679,7 +679,7 @@ ## The data block size. RocksDB compresses data based on the unit of block. ## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. -# block-size = "64KB" +# block-size = "16KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive @@ -915,7 +915,7 @@ [rocksdb.writecf] ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] -# block-size = "64KB" +# block-size = "16KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size = "128MB" diff --git a/etc/error_code.toml b/etc/error_code.toml index 5cdd770f8d2..bb23c9b5e26 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -263,6 +263,11 @@ error = ''' KV:Pd:GlobalConfigNotFound ''' +["KV:Pd:DataCompacted"] +error = ''' +KV:Pd:DataCompacted +''' + ["KV:Pd:Unknown"] error = ''' KV:Pd:Unknown @@ -443,6 +448,11 @@ error = ''' KV:Raftstore:FlashbackNotPrepared ''' +["KV:Raftstore:IsWitness"] +error = ''' +KV:Raftstore:IsWitness +''' + ["KV:Raftstore:SnapAbort"] error = ''' KV:Raftstore:SnapAbort diff --git a/new-mock-engine-store/Cargo.toml b/new-mock-engine-store/Cargo.toml index a3bdbf6a7a7..f9adc9a0cbe 100644 --- a/new-mock-engine-store/Cargo.toml +++ b/new-mock-engine-store/Cargo.toml @@ -17,6 +17,7 @@ protobuf-codec = [ [dependencies] api_version = { workspace = true, default-features = false } +assert-type-eq = "0.1.0" causal_ts = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true, default-features = false } @@ -33,6 +34,7 @@ file_system = { workspace = true, default-features = false } futures = { version = "0.3", features = ["thread-pool", "compat"] } grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +int-enum = "0.5" keys = { workspace = true, default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git", default-features = false } @@ -45,6 +47,8 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true, default-features = false } rand = "0.8" resolved_ts = { workspace = true } + +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true, default-features = false } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 58db2bb0f2c..22038ef4e54 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1,8 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- +#![feature(vec_into_raw_parts)] #![feature(slice_take)] pub mod config; pub mod mock_cluster; +pub mod mock_page_storage; pub mod mock_store; pub mod node; pub mod server; @@ -74,6 +75,7 @@ pub fn copy_data_from( // kv data in memory for cf in 0..3 { for (k, v) in &source.data[cf] { + debug!("copy_data_from region {} {:?} {:?}", region_id, k, v); write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); } } @@ -87,7 +89,7 @@ pub fn copy_data_from( .unwrap(); debug!("copy raft log {:?}", entries); - raft_wb.append(region_id, entries)?; + raft_wb.append(region_id, None, entries)?; box_try!(target_engines.raft.consume(&mut raft_wb, true)); Ok(()) } diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index 9e793544ca4..e3005ae7f9d 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -44,6 +44,7 @@ use raftstore::{ }, Error, Result, }; +use resource_control::ResourceGroupManager; use tempfile::TempDir; pub use test_pd_client::TestPdClient; use test_raftstore::FilterFactory; @@ -93,7 +94,7 @@ pub struct TestData { pub struct Cluster> { // Helper to set ffi_helper_set. pub ffi_helper_lst: Vec, - ffi_helper_set: Arc>>, + pub ffi_helper_set: Arc>>, pub cfg: Config, leaders: HashMap, @@ -110,6 +111,7 @@ pub struct Cluster> { pub sim: Arc>, pub pd_client: Arc, pub test_data: TestData, + resource_manager: Option>, } impl> std::panic::UnwindSafe for Cluster {} @@ -154,6 +156,7 @@ impl> Cluster { expected_leader_safe_ts: 0, expected_self_safe_ts: 0, }, + resource_manager: Some(Arc::new(ResourceGroupManager::default())), } } @@ -235,14 +238,14 @@ impl> Cluster { pub fn iter_ffi_helpers( &self, store_ids: Option>, - f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet), + f: &mut dyn FnMut(u64, &engine_store_ffi::TiFlashEngine, &mut FFIHelperSet), ) { let ids = match store_ids { Some(ids) => ids, None => self.engines.keys().copied().collect::>(), }; for id in ids { - let engine = self.get_engine(id); + let engine = self.get_tiflash_engine(id); let lock = self.ffi_helper_set.lock(); match lock { Ok(mut l) => { @@ -254,6 +257,16 @@ impl> Cluster { } } + pub fn access_ffi_helpers(&self, f: &mut dyn FnMut(&mut HashMap)) { + let lock = self.ffi_helper_set.lock(); + match lock { + Ok(mut l) => { + f(&mut l); + } + Err(_) => std::process::exit(1), + } + } + pub fn create_engines(&mut self) { self.io_rate_limiter = Some(Arc::new( self.cfg @@ -269,14 +282,12 @@ impl> Cluster { pub fn run(&mut self) { self.create_engines(); self.bootstrap_region().unwrap(); - self.bootstrap_ffi_helper_set(); self.start().unwrap(); } pub fn run_conf_change(&mut self) -> u64 { self.create_engines(); let region_id = self.bootstrap_conf_change(); - self.bootstrap_ffi_helper_set(); // Will not start new nodes in `start` self.start().unwrap(); region_id @@ -284,9 +295,7 @@ impl> Cluster { pub fn run_conf_change_no_start(&mut self) -> u64 { self.create_engines(); - let region_id = self.bootstrap_conf_change(); - self.bootstrap_ffi_helper_set(); - region_id + self.bootstrap_conf_change() } /// We need to create FFIHelperSet while we create engine. 
@@ -297,6 +306,7 @@ impl> Cluster { key_manager: &Option>, router: &Option>, ) { + init_global_ffi_helper_set(); let (mut ffi_helper_set, _node_cfg) = self.make_ffi_helper_set(0, engines, key_manager, router); @@ -312,17 +322,20 @@ impl> Cluster { .engine_store_server_helper; let helper = engine_store_ffi::gen_engine_store_server_helper(helper_ptr); - let ffi_hub = Arc::new(engine_store_ffi::observer::TiFlashFFIHub { + let ffi_hub = Arc::new(engine_store_ffi::TiFlashFFIHub { engine_store_server_helper: helper, }); (helper_ptr, ffi_hub) }; let engines = ffi_helper_set.engine_store_server.engines.as_mut().unwrap(); - + let proxy_config_set = Arc::new(engine_tiflash::ProxyConfigSet { + engine_store: self.cfg.proxy_cfg.engine_store.clone(), + }); engines.kv.init( helper_ptr, self.cfg.proxy_cfg.raft_store.snap_handle_pool_size, Some(ffi_hub), + Some(proxy_config_set), ); assert_ne!(engines.kv.engine_store_server_helper, 0); @@ -338,6 +351,7 @@ impl> Cluster { } else { self.ffi_helper_lst.pop().unwrap() }; + debug!("set up ffi helper set for {}", node_id); ffi_helper_set.engine_store_server.id = node_id; self.ffi_helper_set .lock() @@ -345,6 +359,7 @@ impl> Cluster { .insert(node_id, ffi_helper_set); } + // Need self.engines be filled. pub fn bootstrap_ffi_helper_set(&mut self) { let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); // We force iterate engines in sorted order. @@ -378,8 +393,6 @@ impl> Cluster { } pub fn start_with(&mut self, skip_set: HashSet) -> ServerResult<()> { - init_global_ffi_helper_set(); - // Try recover from last shutdown. // `self.engines` is inited in bootstrap_region or bootstrap_conf_change. let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); @@ -412,7 +425,8 @@ impl> Cluster { if !skip_set.is_empty() { panic!("Error when start with skip set"); } - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); @@ -486,6 +500,7 @@ pub fn make_global_ffi_helper_set_no_bind() -> (EngineHelperSet, *const u8) { pub fn init_global_ffi_helper_set() { unsafe { START.call_once(|| { + debug!("init_global_ffi_helper_set"); assert_eq!(engine_store_ffi::get_engine_store_server_helper_ptr(), 0); let (set, ptr) = make_global_ffi_helper_set_no_bind(); engine_store_ffi::init_engine_store_server_helper(ptr); @@ -518,9 +533,10 @@ pub fn create_tiflash_test_engine( let kv_path = dir.path().join(tikv::config::DEFAULT_ROCKSDB_SUB_DIR); let kv_path_str = kv_path.to_str().unwrap(); - let kv_db_opt = cfg - .rocksdb - .build_opt(&cfg.rocksdb.build_resources(env.clone())); + let kv_db_opt = cfg.rocksdb.build_opt( + &cfg.rocksdb.build_resources(env.clone()), + cfg.storage.engine, + ); let cache = cfg.storage.block_cache.build_shared_cache(); let raft_cfs_opt = cfg.raftdb.build_cf_opts(&cache); @@ -810,7 +826,8 @@ impl> Cluster { assert_ne!(engines.kv.engine_store_server_helper, 0); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { @@ -879,6 +896,7 @@ impl> Cluster { .insert(id, self.key_managers[i].clone()); } + self.bootstrap_ffi_helper_set(); let mut region = 
metapb::Region::default(); region.set_id(1); region.set_start_key(keys::EMPTY_KEY.to_vec()); @@ -897,6 +915,9 @@ impl> Cluster { "node_id" => id, ); prepare_bootstrap_cluster(engines, ®ion)?; + tikv_util::debug!("prepare_bootstrap_cluster finish"; + "node_id" => id, + ); } self.bootstrap_cluster(region); @@ -915,6 +936,7 @@ impl> Cluster { .insert(id, self.key_managers[i].clone()); } + self.bootstrap_ffi_helper_set(); for (&id, engines) in &self.engines { bootstrap_store(engines, self.id(), id).unwrap(); } diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs new file mode 100644 index 00000000000..ef2c23813fb --- /dev/null +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -0,0 +1,242 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use core::ops::Bound::{Excluded, Included, Unbounded}; +use std::{ + collections::BTreeMap, + sync::{atomic::AtomicU64, Arc, RwLock}, +}; + +pub use engine_store_ffi::{ + interfaces::root::DB as ffi_interfaces, BaseBuffView, CppStrWithView, EngineStoreServerHelper, + PageAndCppStrWithView, RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrCarr, RawVoidPtr, +}; + +use crate::{ + create_cpp_str, create_cpp_str_parts, + mock_store::{into_engine_store_server_wrap, RawCppPtrTypeImpl}, +}; + +pub enum MockPSSingleWrite { + Put((Vec, MockPSUniversalPage)), + Delete(Vec), +} + +pub struct MockPSWriteBatch { + pub data: Vec<(u64, MockPSSingleWrite)>, + core: Arc>, +} + +impl MockPSWriteBatch { + fn new(core: Arc>) -> Self { + Self { + data: Default::default(), + core, + } + } +} + +pub struct MockPSUniversalPage { + data: Vec, +} + +impl From for MockPSUniversalPage { + fn from(val: BaseBuffView) -> Self { + MockPSUniversalPage { + data: val.to_slice().to_owned(), + } + } +} + +pub struct MockPageStorageCore { + current_id: AtomicU64, +} + +impl MockPageStorageCore { + pub fn alloc_id(&mut self) -> u64 { + self.current_id + .fetch_add(1, std::sync::atomic::Ordering::SeqCst) + } +} + +impl Default for MockPageStorageCore { + fn default() -> Self { + Self { + current_id: AtomicU64::new(1), + } + } +} + +#[derive(Default)] +pub struct MockPageStorage { + pub data: RwLock, MockPSUniversalPage>>, + pub core: Arc>, +} + +pub unsafe extern "C" fn ffi_mockps_create_write_batch( + wrap: *const ffi_interfaces::EngineStoreServerWrap, +) -> RawCppPtr { + let store = into_engine_store_server_wrap(wrap); + let core = (*store.engine_store_server).page_storage.core.clone(); + let ptr = Box::into_raw(Box::new(MockPSWriteBatch::new(core))); + RawCppPtr { + ptr: ptr as RawVoidPtr, + type_: RawCppPtrTypeImpl::PSWriteBatch.into(), + } +} + +impl From for &mut MockPSWriteBatch { + fn from(value: RawVoidPtr) -> Self { + unsafe { &mut *(value as *mut MockPSWriteBatch) } + } +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_put_page( + wb: RawVoidPtr, + page_id: BaseBuffView, + page: BaseBuffView, +) { + let wb: &mut MockPSWriteBatch = <&mut MockPSWriteBatch as From>::from(wb); + let wid = wb.core.write().unwrap().alloc_id(); + let write = MockPSSingleWrite::Put((page_id.to_slice().to_vec(), page.into())); + wb.data.push((wid, write)); +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_del_page(wb: RawVoidPtr, page_id: BaseBuffView) { + let wb: &mut MockPSWriteBatch = <&mut MockPSWriteBatch as From>::from(wb); + let wid = wb.core.write().unwrap().alloc_id(); + let write = MockPSSingleWrite::Delete(page_id.to_slice().to_vec()); + wb.data.push((wid, write)); +} + +pub unsafe extern "C" fn 
ffi_mockps_write_batch_size(wb: RawVoidPtr) -> u64 { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.len() as u64 +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + u8::from(wb.data.is_empty()) +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_merge(lwb: RawVoidPtr, rwb: RawVoidPtr) { + let lwb: _ = <&mut MockPSWriteBatch as From>::from(lwb); + let rwb: _ = <&mut MockPSWriteBatch as From>::from(rwb); + lwb.data.append(&mut rwb.data); +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_clear(wb: RawVoidPtr) { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.clear(); +} + +pub unsafe extern "C" fn ffi_mockps_consume_write_batch( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + wb: RawVoidPtr, +) { + let store = into_engine_store_server_wrap(wrap); + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + let mut guard = (*store.engine_store_server) + .page_storage + .data + .write() + .unwrap(); + wb.data.sort_by_key(|k| k.0); + for (_, write) in wb.data.drain(..) { + match write { + MockPSSingleWrite::Put(w) => { + guard.insert(w.0, w.1); + } + MockPSSingleWrite::Delete(w) => { + guard.remove(&w); + } + } + } +} + +pub unsafe extern "C" fn ffi_mockps_handle_read_page( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + page_id: BaseBuffView, +) -> CppStrWithView { + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let key = page_id.to_slice().to_vec(); + match guard.get(&key) { + Some(p) => create_cpp_str(Some(p.data.clone())), + None => create_cpp_str(None), + } +} + +pub unsafe extern "C" fn ffi_mockps_handle_scan_page( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + start_page_id: BaseBuffView, + end_page_id: BaseBuffView, +) -> RawCppPtrCarr { + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let range = guard.range(( + Included(start_page_id.to_slice().to_vec()), + Excluded(end_page_id.to_slice().to_vec()), + )); + let range = range.collect::>(); + let mut result: Vec = Vec::with_capacity(range.len()); + for (k, v) in range.into_iter() { + let (page, page_view) = create_cpp_str_parts(Some(v.data.clone())); + let (key, key_view) = create_cpp_str_parts(Some(k.clone())); + let pacwv = PageAndCppStrWithView { + page, + key, + page_view, + key_view, + }; + result.push(pacwv) + } + let (result_ptr, l, c) = result.into_raw_parts(); + assert_eq!(l, c); + RawCppPtrCarr { + inner: result_ptr as RawVoidPtr, + len: c as u64, + type_: RawCppPtrTypeImpl::PSPageAndCppStr.into(), + } +} + +pub unsafe extern "C" fn ffi_mockps_handle_purge_pagestorage( + _wrap: *const ffi_interfaces::EngineStoreServerWrap, +) { + // TODO +} + +pub unsafe extern "C" fn ffi_mockps_handle_seek_ps_key( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + page_id: BaseBuffView, +) -> CppStrWithView { + // Find the first great or equal than + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let mut range = guard.range((Included(page_id.to_slice().to_vec()), Unbounded)); + let kv = range.next().unwrap(); + create_cpp_str(Some(kv.0.clone())) +} + +pub unsafe extern "C" fn ffi_mockps_ps_is_empty( + wrap: *const ffi_interfaces::EngineStoreServerWrap, +) -> u8 { + let store = 
into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + u8::from(guard.is_empty()) +} diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 3e2b283bdbf..ec154b8e269 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -2,7 +2,7 @@ pub use std::{ cell::RefCell, - collections::{BTreeMap, HashMap, HashSet}, + collections::BTreeMap, pin::Pin, sync::{ atomic::{AtomicU64, Ordering}, @@ -11,27 +11,34 @@ pub use std::{ time::Duration, }; +use assert_type_eq; +use collections::{HashMap, HashSet}; pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, - RawCppPtr, UnwrapExternCFunc, + RawCppPtr, RawVoidPtr, UnwrapExternCFunc, }; +use engine_traits::RaftEngineReadOnly; pub use engine_traits::{ Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, SyncMutable, WriteBatch, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; +use int_enum::IntEnum; pub use kvproto::{ raft_cmdpb::AdminCmdType, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; pub use protobuf::Message; pub use tikv_util::{box_err, box_try, debug, error, info, warn}; +use crate::node::NodeCluster; pub use crate::{ config::MockConfig, - mock_cluster, + copy_data_from, copy_meta_from, general_get_apply_state, general_get_region_local_state, + get_apply_state, get_raft_local_state, get_region_local_state, mock_cluster, mock_cluster::{ must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, }, + mock_page_storage::*, server::ServerCluster, }; @@ -73,6 +80,7 @@ impl Region { #[derive(Default)] pub struct RegionStats { pub pre_handle_count: AtomicU64, + pub fast_add_peer_count: AtomicU64, } pub struct EngineStoreServer { @@ -82,6 +90,7 @@ pub struct EngineStoreServer { pub proxy_compat: bool, pub mock_cfg: MockConfig, pub region_states: RefCell>, + pub page_storage: MockPageStorage, } impl EngineStoreServer { @@ -96,6 +105,7 @@ impl EngineStoreServer { proxy_compat: false, mock_cfg: MockConfig::default(), region_states: RefCell::new(Default::default()), + page_storage: Default::default(), } } @@ -153,6 +163,12 @@ impl EngineStoreServer { } } } + + pub unsafe fn write_to_db_by_region_id(&mut self, region_id: u64, reason: String) { + let kv = &mut self.engines.as_mut().unwrap().kv; + let region = self.kvstore.get_mut(®ion_id).unwrap(); + write_to_db_data_by_engine(self.id, kv, region, reason) + } } pub struct EngineStoreServerWrap { @@ -217,23 +233,10 @@ fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { data.remove(k); } -unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { +unsafe fn load_data_from_db(store: &mut EngineStoreServer, region_id: u64) { let store_id = store.id; let engine = &mut store.engines.as_mut().unwrap().kv; - let apply_state: RaftApplyState = engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .unwrap() - .unwrap(); - let region_state: RegionLocalState = engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) - .unwrap() - .unwrap(); - let region = store.kvstore.get_mut(®ion_id).unwrap(); - region.apply_state = apply_state; - region.region = region_state.get_region().clone(); - set_new_region_peer(region, store.id); - for cf in 0..3 { let cf_name = cf_to_name(cf.into()); 
region.data[cf].clear(); @@ -262,17 +265,39 @@ unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { } } +unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { + let engine = &mut store.engines.as_mut().unwrap().kv; + let apply_state: RaftApplyState = general_get_apply_state(engine, region_id).unwrap(); + let region_state: RegionLocalState = general_get_region_local_state(engine, region_id).unwrap(); + + let region = store.kvstore.get_mut(®ion_id).unwrap(); + region.apply_state = apply_state; + region.region = region_state.get_region().clone(); + set_new_region_peer(region, store.id); + + load_data_from_db(store, region_id); +} + unsafe fn write_to_db_data( store: &mut EngineStoreServer, region: &mut Box, reason: String, +) { + let kv = &mut store.engines.as_mut().unwrap().kv; + write_to_db_data_by_engine(store.id, kv, region, reason) +} + +unsafe fn write_to_db_data_by_engine( + store_id: u64, + kv: &TiFlashEngine, + region: &mut Box, + reason: String, ) { info!("mock flush to engine"; "region" => ?region.region, - "store_id" => store.id, + "store_id" => store_id, "reason" => reason ); - let kv = &mut store.engines.as_mut().unwrap().kv; for cf in 0..3 { let pending_write = std::mem::take(region.pending_write.as_mut().get_mut(cf).unwrap()); let mut pending_remove = @@ -405,6 +430,7 @@ impl EngineStoreServerWrap { .insert(region_meta.id, Box::new(new_region)); } } + { // Move data let region_ids = @@ -447,6 +473,7 @@ impl EngineStoreServerWrap { { let target_region = &mut (engine_store_server.kvstore.get_mut(®ion_id).unwrap()); + let target_region_meta = &mut target_region.region; let target_version = target_region_meta.get_region_epoch().get_version(); @@ -696,14 +723,30 @@ pub fn gen_engine_store_server_helper( fn_handle_http_request: None, fn_check_http_uri_available: None, fn_gc_raw_cpp_ptr: Some(ffi_gc_raw_cpp_ptr), + fn_gc_raw_cpp_ptr_carr: Some(ffi_gc_raw_cpp_ptr_carr), + fn_gc_special_raw_cpp_ptr: Some(ffi_gc_special_raw_cpp_ptr), fn_get_config: None, fn_set_store: None, fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), + fn_fast_add_peer: Some(ffi_fast_add_peer), + fn_create_write_batch: Some(ffi_mockps_create_write_batch), + fn_write_batch_put_page: Some(ffi_mockps_write_batch_put_page), + fn_write_batch_del_page: Some(ffi_mockps_write_batch_del_page), + fn_write_batch_size: Some(ffi_mockps_write_batch_size), + fn_write_batch_is_empty: Some(ffi_mockps_write_batch_is_empty), + fn_write_batch_merge: Some(ffi_mockps_write_batch_merge), + fn_write_batch_clear: Some(ffi_mockps_write_batch_clear), + fn_consume_write_batch: Some(ffi_mockps_consume_write_batch), + fn_handle_read_page: Some(ffi_mockps_handle_read_page), + fn_handle_purge_pagestorage: Some(ffi_mockps_handle_purge_pagestorage), + fn_handle_scan_page: Some(ffi_mockps_handle_scan_page), + fn_handle_seek_ps_key: Some(ffi_mockps_handle_seek_ps_key), + fn_ps_is_empty: Some(ffi_mockps_ps_is_empty), } } -unsafe fn into_engine_store_server_wrap( +pub unsafe fn into_engine_store_server_wrap( arg1: *const ffi_interfaces::EngineStoreServerWrap, ) -> &'static mut EngineStoreServerWrap { &mut *(arg1 as *mut EngineStoreServerWrap) @@ -732,36 +775,31 @@ unsafe extern "C" fn ffi_handle_write_raft_cmd( store.handle_write_raft_cmd(arg2, arg3) } -enum RawCppPtrTypeImpl { +#[repr(u32)] +#[derive(IntEnum, Copy, Clone)] +pub enum RawCppPtrTypeImpl { None = 0, - String, - PreHandledSnapshotWithBlock, - WakerNotifier, + String = 1, + 
PreHandledSnapshotWithBlock = 11, + WakerNotifier = 12, + PSWriteBatch = 13, + PSUniversalPage = 14, + PSPageAndCppStr = 15, } -// TODO -#[allow(clippy::from_over_into)] -impl From for RawCppPtrTypeImpl { - fn from(o: ffi_interfaces::RawCppPtrType) -> Self { - match o { - 0 => RawCppPtrTypeImpl::None, - 1 => RawCppPtrTypeImpl::String, - 2 => RawCppPtrTypeImpl::PreHandledSnapshotWithBlock, - 3 => RawCppPtrTypeImpl::WakerNotifier, - _ => unreachable!(), - } +impl From for ffi_interfaces::RawCppPtrType { + fn from(value: RawCppPtrTypeImpl) -> Self { + assert_type_eq::assert_type_eq!(ffi_interfaces::RawCppPtrType, u32); + value.int_value() } } -// TODO remove this warn. -#[allow(clippy::from_over_into)] -impl Into for RawCppPtrTypeImpl { - fn into(self) -> ffi_interfaces::RawCppPtrType { - match self { - RawCppPtrTypeImpl::None => 0, - RawCppPtrTypeImpl::String => 1, - RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => 2, - RawCppPtrTypeImpl::WakerNotifier => 3, +impl From for RawCppPtrTypeImpl { + fn from(value: ffi_interfaces::RawCppPtrType) -> Self { + if let Ok(s) = RawCppPtrTypeImpl::from_int(value) { + s + } else { + panic!("unknown RawCppPtrType {:?}", value); } } } @@ -917,11 +955,35 @@ impl ProxyNotifier { } } +extern "C" fn ffi_gc_special_raw_cpp_ptr( + ptr: ffi_interfaces::RawVoidPtr, + hint_len: u64, + tp: ffi_interfaces::SpecialCppPtrType, +) { + match tp { + ffi_interfaces::SpecialCppPtrType::None => (), + ffi_interfaces::SpecialCppPtrType::TupleOfRawCppPtr => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawCppPtr, + hint_len as usize, + )); + drop(p); + }, + ffi_interfaces::SpecialCppPtrType::ArrayOfRawCppPtr => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawVoidPtr, + hint_len as usize, + )); + drop(p); + }, + } +} + extern "C" fn ffi_gc_raw_cpp_ptr( ptr: ffi_interfaces::RawVoidPtr, tp: ffi_interfaces::RawCppPtrType, ) { - match RawCppPtrTypeImpl::from(tp) { + match tp.into() { RawCppPtrTypeImpl::None => {} RawCppPtrTypeImpl::String => unsafe { drop(Box::>::from_raw(ptr as *mut _)); @@ -932,6 +994,43 @@ extern "C" fn ffi_gc_raw_cpp_ptr( RawCppPtrTypeImpl::WakerNotifier => unsafe { drop(Box::from_raw(ptr as *mut ProxyNotifier)); }, + RawCppPtrTypeImpl::PSWriteBatch => unsafe { + drop(Box::from_raw(ptr as *mut MockPSWriteBatch)); + }, + RawCppPtrTypeImpl::PSUniversalPage => unsafe { + drop(Box::from_raw(ptr as *mut MockPSUniversalPage)); + }, + _ => todo!(), + } +} + +extern "C" fn ffi_gc_raw_cpp_ptr_carr( + ptr: ffi_interfaces::RawVoidPtr, + tp: ffi_interfaces::RawCppPtrType, + len: u64, +) { + match tp.into() { + RawCppPtrTypeImpl::String => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawVoidPtr, + len as usize, + )); + for i in 0..len { + let i = i as usize; + if !p[i].is_null() { + ffi_gc_raw_cpp_ptr(p[i], RawCppPtrTypeImpl::String.into()); + } + } + drop(p); + }, + RawCppPtrTypeImpl::PSPageAndCppStr => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut PageAndCppStrWithView, + len as usize, + )); + drop(p) + }, + _ => todo!(), } } @@ -1233,6 +1332,252 @@ unsafe extern "C" fn ffi_handle_compute_store_stats( } } +pub unsafe fn create_cpp_str_parts( + s: Option>, +) -> (ffi_interfaces::RawCppPtr, ffi_interfaces::BaseBuffView) { + match s { + Some(s) => { + let len = s.len() as u64; + let ptr = Box::into_raw(Box::new(s)); // leak + ( + ffi_interfaces::RawCppPtr { + ptr: ptr as RawVoidPtr, + type_: RawCppPtrTypeImpl::String.into(), + }, + 
ffi_interfaces::BaseBuffView { + data: (*ptr).as_ptr() as *const _, + len, + }, + ) + } + None => ( + ffi_interfaces::RawCppPtr { + ptr: std::ptr::null_mut(), + type_: RawCppPtrTypeImpl::None.into(), + }, + ffi_interfaces::BaseBuffView { + data: std::ptr::null(), + len: 0, + }, + ), + } +} + +pub unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { + let (p, v) = create_cpp_str_parts(s); + ffi_interfaces::CppStrWithView { inner: p, view: v } +} + +#[allow(clippy::redundant_closure_call)] +unsafe extern "C" fn ffi_fast_add_peer( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + region_id: u64, + new_peer_id: u64, +) -> ffi_interfaces::FastAddPeerRes { + let store = into_engine_store_server_wrap(arg1); + let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); + let store_id = (*store.engine_store_server).id; + (*store.engine_store_server).mutate_region_states(region_id, |e: &mut RegionStats| { + e.fast_add_peer_count.fetch_add(1, Ordering::SeqCst); + }); + + let failed_add_peer_res = + |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { + status, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; + let from_store = (|| { + fail::fail_point!("ffi_fast_add_peer_from_id", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 1 + })(); + let block_wait: bool = (|| { + fail::fail_point!("ffi_fast_add_peer_block_wait", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; + let fail_after_write: bool = (|| { + fail::fail_point!("ffi_fast_add_peer_fail_after_write", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; + debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); + + for retry in 0..300 { + let mut ret: Option = None; + if retry > 0 { + std::thread::sleep(std::time::Duration::from_millis(30)); + } + cluster.access_ffi_helpers(&mut |guard: &mut HashMap| { + debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); + let source_server = match guard.get_mut(&from_store) { + Some(s) => &mut s.engine_store_server, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable)); + return; + } + }; + let source_engines = match source_server.engines.clone() { + Some(s) => s, + None => { + error!("recover from remote peer: failed get source engine"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return + } + }; + // TODO We must ask the remote peer to persist before get a snapshot. + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, + None => { + error!("recover from remote peer: failed read source region info"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + let region_local_state: RegionLocalState = match general_get_region_local_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + debug!("recover from remote peer: preparing from {} to {}:{}, not region state", from_store, store_id, new_peer_id; "region_id" => region_id); + // We don't return BadData here, since the data may not be persisted. 
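+ // Returning WaitForData (instead of BadData) lets the retry loop at the end of this function try again when the `ffi_fast_add_peer_block_wait` fail point is set, rather than aborting the fast path.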
+ ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData)); + return; + } + }; + let new_region_meta = region_local_state.get_region(); + let peer_state = region_local_state.get_state(); + // Validation + match peer_state { + PeerState::Tombstone | PeerState::Applying => { + // Note in real implementation, we will avoid selecting this peer. + error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + _ => { + info!("recover from remote peer: preparing from {} to {}:{}, ok peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + } + }; + if !engine_store_ffi::observer::validate_remote_peer_region( + new_region_meta, + store_id, + new_peer_id, + ) { + debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData)); + return; + } + // TODO check commit_index and applied_index here + debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); + let new_region = make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = match (*store.engine_store_server).engines.clone() { + Some(s) => s, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError)); + return; + } + }; + let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { + Some(s) => s, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); + // Must first dump meta then data, otherwise data may lag behind. + // We can see a raft log hole at applied_index otherwise. 
+ let apply_state: RaftApplyState = match general_get_apply_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + error!("recover from remote peer: failed read apply state"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + debug!("recover from remote peer: begin data from {} to {}", from_store, store_id; + "region_id" => region_id, + "apply_state" => ?apply_state, + ); + // TODO In TiFlash we should take care of write batch size + if let Err(e) = copy_data_from( + &source_engines, + &target_engines, + &source_region, + target_region, + ) { + error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject)); + return; + } + if fail_after_write { + let mut raft_wb = target_engines.raft.log_batch(1024); + let mut entries: Vec = Default::default(); + target_engines + .raft + .get_all_entries_to(region_id, &mut entries) + .unwrap(); + let l = entries.len(); + // Manually delete one raft log + // let from = entries.get(l - 2).unwrap().get_index(); + let from = 7; + let to = entries.get(l - 1).unwrap().get_index() + 1; + debug!("recover from remote peer: simulate error from {} to {}", from_store, store_id; + "region_id" => region_id, + "from" => from, + "to" => to, + ); + // raft_wb.cut_logs(region_id, from, to); + target_engines.raft.gc(region_id, from, to, &mut raft_wb).unwrap(); + target_engines.raft.consume(&mut raft_wb, true).unwrap(); + } + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); + let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); + let region_ptr = create_cpp_str(Some(region_bytes)); + // Check if we have commit_index. 
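(Aside on the two buffers built above: they carry the protobuf-encoded RaftApplyState and Region. A minimal sketch of how the receiving side of the FFI could decode the apply-state view again; treating BaseBuffView's data/len as a byte slice follows the struct literal used earlier in this file, and the helper name here is made up for illustration.)

    use kvproto::raft_serverpb::RaftApplyState;
    use protobuf::Message;

    // Hypothetical helper: rebuild the RaftApplyState carried by a BaseBuffView.
    unsafe fn decode_apply_state(view: &ffi_interfaces::BaseBuffView) -> RaftApplyState {
        let bytes = std::slice::from_raw_parts(view.data as *const u8, view.len as usize);
        let mut state = RaftApplyState::default();
        state.merge_from_bytes(bytes).unwrap();
        state
    }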
+ debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); + ret = Some(ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::Ok, + apply_state: apply_state_ptr, + region: region_ptr, + }); + }); + if let Some(r) = ret { + match r.status { + ffi_interfaces::FastAddPeerStatus::WaitForData => { + if block_wait { + continue; + } else { + return r; + } + } + _ => return r, + } + } + } + error!("recover from remote peer: failed after retry"; "region_id" => region_id); + failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) +} + #[allow(clippy::single_element_loop)] pub fn move_data_from( engine_store_server: &mut EngineStoreServer, diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 954050a7f2c..23d17626e98 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -82,16 +82,27 @@ impl Default for ChannelTransport { impl Transport for ChannelTransport { #[allow(clippy::significant_drop_in_scrutinee)] + #[allow(clippy::redundant_closure_call)] fn send(&mut self, msg: RaftMessage) -> Result<()> { - let from_store = msg.get_from_peer().get_store_id(); + let mut from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); let to_peer_id = msg.get_to_peer().get_id(); let region_id = msg.get_region_id(); let is_snapshot = msg.get_message().get_msg_type() == MessageType::MsgSnapshot; if is_snapshot { + let fake_self_snapshot = (|| { + fail::fail_point!("fast_add_peer_fake_snapshot", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })(); let snap = msg.get_message().get_snapshot(); let key = SnapKey::from_snap(snap).unwrap(); + if fake_self_snapshot == 1 { + from_store = to_store; + } let from = match self.core.lock().unwrap().snap_paths.get(&from_store) { Some(p) => { p.0.register(key.clone(), SnapEntry::Sending); @@ -99,6 +110,9 @@ impl Transport for ChannelTransport { } None => return Err(box_err!("missing temp dir for store {}", from_store)), }; + if fake_self_snapshot == 1 && !from.exists() { + panic!("non-exist snapshot"); + } let to = match self.core.lock().unwrap().snap_paths.get(&to_store) { Some(p) => { p.0.register(key.clone(), SnapEntry::Receiving); @@ -138,11 +152,14 @@ impl Transport for ChannelTransport { h.send_raft_msg(msg)?; if is_snapshot { // should report snapshot finish. 
- let _ = core.routers[&from_store].report_snapshot_status( - region_id, - to_peer_id, - SnapshotStatus::Finish, - ); + match core.routers.get(&from_store) { + Some(router) => router.report_snapshot_status( + region_id, + to_peer_id, + SnapshotStatus::Finish, + ), + None => return Err(box_err!("Find no from_store {}", from_store)), + }?; } Ok(()) } @@ -299,6 +316,7 @@ impl Simulator for NodeCluster { (snap_mgr.clone(), None) }; + debug!("snapshot_mgr path of {} is {:?}", node_id, snap_mgr_path); self.snap_mgrs.insert(node_id, snap_mgr.clone()); let importer = { @@ -314,11 +332,19 @@ impl Simulator for NodeCluster { f(node_id, &mut coprocessor_host); } + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: cfg.proxy_cfg.engine_store.clone(), + pd_endpoints: cfg.pd.endpoints.clone(), + }; let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), + engines.raft.clone(), importer.clone(), cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), + snap_mgr.clone(), + packed_envs, ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 3d0c78cf894..9b80a7d4cad 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -389,14 +389,6 @@ impl ServerCluster { Arc::clone(&importer), ); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node_id, - engines.kv.clone(), - importer.clone(), - 2, - ); - tiflash_ob.register_to(&mut coprocessor_host); - let check_leader_runner = CheckLeaderRunner::new(store_meta.clone(), coprocessor_host.clone()); let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); @@ -426,6 +418,7 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), None, + None, // TODO resource_ctl )?; self.storages.insert(node_id, raft_engine); @@ -543,6 +536,22 @@ impl ServerCluster { let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: cfg.proxy_cfg.engine_store.clone(), + pd_endpoints: cfg.pd.endpoints.clone(), + }; + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node_id, + engines.kv.clone(), + engines.raft.clone(), + importer.clone(), + cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), + snap_mgr.clone(), + packed_envs, + ); + tiflash_ob.register_to(&mut coprocessor_host); + // Register the role change observer of the lock manager. 
lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 70dbfdfa1f6..a6d432a1fd7 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -1,8 +1,9 @@ set -uxeo pipefail if [[ $M == "fmt" ]]; then make gen_proxy_ffi + git status -s GIT_STATUS=$(git status -s) && if [[ ${GIT_STATUS} ]]; then echo "Error: found illegal git status"; echo ${GIT_STATUS}; [[ -z ${GIT_STATUS} ]]; fi - cargo fmt -- --check >/dev/null + cargo fmt -- --check elif [[ $M == "testold" ]]; then export ENGINE_LABEL_VALUE=tiflash export RUST_BACKTRACE=full @@ -43,6 +44,9 @@ elif [[ $M == "testnew" ]]; then cargo test --package proxy_tests --test proxy region cargo test --package proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test + cargo test --package proxy_tests --test proxy fast_add_peer + cargo test --package proxy_tests --test proxy ffi -- --test-threads 1 + cargo test --package proxy_tests --test proxy write --features="proxy_tests/enable-pagestorage" elif [[ $M == "debug" ]]; then # export RUSTC_WRAPPER=~/.cargo/bin/sccache export ENGINE_LABEL_VALUE=tiflash diff --git a/proxy_server/Cargo.toml b/proxy_server/Cargo.toml index b4c42af2cd1..69fb8df002d 100644 --- a/proxy_server/Cargo.toml +++ b/proxy_server/Cargo.toml @@ -34,6 +34,7 @@ nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] pprof-fp = ["tikv/pprof-fp"] +enable-pagestorage = ["engine_tiflash/enable-pagestorage", "engine_store_ffi/enable-pagestorage"] [dependencies] api_version = { workspace = true } @@ -51,6 +52,7 @@ encryption_export = { workspace = true, default-features = false } engine_rocks = { workspace = true, default-features = false } engine_rocks_helper = { workspace = true } engine_store_ffi = { workspace = true, default-features = false } +engine_tiflash = { workspace = true, default-features = false } engine_traits = { workspace = true, default-features = false } error_code = { workspace = true, default-features = false } fail = "0.5" @@ -84,6 +86,7 @@ raftstore = { workspace = true, default-features = false } rand = "0.8" regex = "1.3" resolved_ts = { workspace = true, default-features = false } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true, default-features = false } serde = "1.0" diff --git a/proxy_server/src/config.rs b/proxy_server/src/config.rs index f8e93d85a88..106d3bfcb19 100644 --- a/proxy_server/src/config.rs +++ b/proxy_server/src/config.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, iter::FromIterator, path::Path}; +use engine_store_ffi::EngineStoreConfig; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use itertools::Itertools; use online_config::OnlineConfig; @@ -264,6 +265,9 @@ pub struct ProxyConfig { #[online_config(skip)] pub import: ImportConfig, + + #[online_config(skip)] + pub engine_store: EngineStoreConfig, } /// We use custom default, in case of later non-ordinary config items. 
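(The new engine_store field gives the proxy its own [engine-store] TOML section; the config test later in this patch sets enable-fast-add-peer = true through a file, and the cluster tests set the same flag programmatically. A minimal sketch of the programmatic form, mirroring those tests:)

    let mut cfg = ProxyConfig::default();
    cfg.engine_store.enable_fast_add_peer = true;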
@@ -280,6 +284,7 @@ impl Default for ProxyConfig { memory_usage_high_water: 0.1, readpool: ReadPoolConfig::default(), import: ImportConfig::default(), + engine_store: EngineStoreConfig::default(), } } } diff --git a/proxy_server/src/lib.rs b/proxy_server/src/lib.rs index ee4ecbf2a9c..073ed75024a 100644 --- a/proxy_server/src/lib.rs +++ b/proxy_server/src/lib.rs @@ -35,7 +35,8 @@ fn proxy_version_info() -> String { \nRust Version: {}\ \nStorage Engine: {}\ \nPrometheus Prefix: {}\ - \nProfile: {}", + \nProfile: {}\ + \nEnable Features: {}", option_env!("PROXY_BUILD_GIT_HASH").unwrap_or(fallback), option_env!("PROXY_BUILD_GIT_BRANCH").unwrap_or(fallback), option_env!("PROXY_BUILD_TIME").unwrap_or(fallback), @@ -43,6 +44,7 @@ fn proxy_version_info() -> String { option_env!("ENGINE_LABEL_VALUE").unwrap_or(fallback), option_env!("PROMETHEUS_METRIC_NAME_PREFIX").unwrap_or(fallback), option_env!("PROXY_PROFILE").unwrap_or(fallback), + option_env!("ENABLE_FEATURES").unwrap_or(fallback), ) } diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 7453b0a6034..9d366c966e9 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -27,8 +27,8 @@ use engine_rocks::{ }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_store_ffi::{ - self, EngineStoreServerHelper, EngineStoreServerStatus, RaftProxyStatus, RaftStoreProxy, - RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, + self, ps_engine::PSEngine, EngineStoreServerHelper, EngineStoreServerStatus, RaftProxyStatus, + RaftStoreProxy, RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, }; use engine_traits::{ CachedTablet, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, @@ -62,6 +62,9 @@ use raftstore::{ SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, }; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use server::{memory::*, raft_engine_switch::*}; use tikv::{ @@ -334,7 +337,9 @@ pub unsafe fn run_tikv_proxy( engine_store_server_helper, ) } else { - run_impl::(config, proxy_config, engine_store_server_helper) + run_impl::(config, proxy_config, engine_store_server_helper) + // run_impl::(config, proxy_config, + // engine_store_server_helper) } }) } @@ -393,7 +398,7 @@ impl TiKvServer { .unwrap(); // Create raft engine - let (raft_engine, raft_statistics) = CER::build( + let (mut raft_engine, raft_statistics) = CER::build( &self.config, &env, &self.encryption_key_manager, @@ -401,6 +406,13 @@ impl TiKvServer { ); self.raft_statistics = raft_statistics; + match raft_engine.as_ps_engine() { + None => {} + Some(ps_engine) => { + ps_engine.init(engine_store_server_helper); + } + } + // Create kv engine. 
let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) // TODO(tiflash) check if we need a old version of RocksEngine, or if we need to upgrade @@ -416,15 +428,19 @@ impl TiKvServer { self.kv_statistics = Some(factory.rocks_statistics()); let helper = engine_store_ffi::gen_engine_store_server_helper(engine_store_server_helper); - let ffi_hub = Arc::new(engine_store_ffi::observer::TiFlashFFIHub { + let ffi_hub = Arc::new(engine_store_ffi::TiFlashFFIHub { engine_store_server_helper: helper, }); // engine_tiflash::RocksEngine has engine_rocks::RocksEngine inside let mut kv_engine = TiFlashEngine::from_rocks(kv_engine); + let proxy_config_set = Arc::new(engine_tiflash::ProxyConfigSet { + engine_store: self.proxy_config.engine_store.clone(), + }); kv_engine.init( engine_store_server_helper, self.proxy_config.raft_store.snap_handle_pool_size, Some(ffi_hub), + Some(proxy_config_set), ); let engines = Engines::new(kv_engine.clone(), raft_engine); @@ -491,6 +507,7 @@ struct TiKvServer { background_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Option>, tablet_registry: Option>, } @@ -535,19 +552,39 @@ impl TiKvServer { // Initialize and check config info!("using proxy config"; "config" => ?proxy_config); + info!("!!!!! using proxy config 2"; "engine_store" => ?proxy_config.engine_store); + let cfg_controller = Self::init_config(config, &proxy_config); let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); - // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); - let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. + background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + // Initialize raftstore channels. 
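(When resource control is enabled, the ResourceGroupManager built above is the single handle from which per-pool controllers are derived; the read-pool and scheduler construction further down in this hunk obtain theirs as in this sketch, with the pool names taken from those call sites:)

    let unified_read_ctl = resource_manager
        .as_ref()
        .map(|m| m.derive_controller("unified-read-pool".into(), true));
    let scheduler_ctl = resource_manager
        .as_ref()
        .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true));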
+ let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); + let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), config.coprocessor.clone(), @@ -598,6 +635,7 @@ impl TiKvServer { flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, tablet_registry: None, } } @@ -902,10 +940,15 @@ impl TiKvServer { } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + resource_ctl, )) } else { None @@ -988,8 +1031,12 @@ impl TiKvServer { Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), None, // causal_ts_provider + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); + cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( @@ -1180,14 +1227,6 @@ impl TiKvServer { } let importer = Arc::new(importer); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node.id(), - self.engines.as_ref().unwrap().engines.kv.clone(), - importer.clone(), - self.proxy_config.raft_store.snap_handle_pool_size, - ); - tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - let check_leader_runner = CheckLeaderRunner::new( engines.store_meta.clone(), self.coprocessor_host.clone().unwrap(), @@ -1221,6 +1260,23 @@ impl TiKvServer { health_service, ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: self.proxy_config.engine_store.clone(), + pd_endpoints: self.config.pd.endpoints.clone(), + }; + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node.id(), + self.engines.as_ref().unwrap().engines.kv.clone(), + self.engines.as_ref().unwrap().engines.raft.clone(), + importer.clone(), + self.proxy_config.raft_store.snap_handle_pool_size, + server.transport().clone(), + snap_mgr.clone(), + packed_envs, + ); + tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + cfg_controller.register( tikv::config::Module::Server, Box::new(ServerConfigManager::new( @@ -1623,7 +1679,12 @@ pub trait ConfiguredRaftEngine: RaftEngine { fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } + fn register_config(&self, _cfg_controller: &mut ConfigController) {} + + fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { + None + } } impl ConfiguredRaftEngine for engine_rocks::RocksEngine { @@ -1710,6 +1771,21 @@ impl ConfiguredRaftEngine for RaftLogEngine { } } +impl ConfiguredRaftEngine for PSEngine { + fn build( + _config: &TikvConfig, + _env: &Arc, + _key_manager: &Option>, + _block_cache: &Cache, + ) -> (Self, Option>) { + (PSEngine::new(), None) + } + + fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { + Some(self) + } +} + /// Various sanity-checks and logging before running a server. /// /// Warnings are logged. 
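(The PSEngine plumbing above relies on a small default-method pattern: ConfiguredRaftEngine gains as_ps_engine() returning None, and only PSEngine overrides it, so the generic server setup can initialize PageStorage without knowing the concrete engine type. A reduced, self-contained sketch of the same shape, with simplified names that are not the real trait:)

    trait MaybePageStorage {
        fn as_ps(&mut self) -> Option<&mut PsLike> {
            None
        }
    }

    struct PsLike;

    impl PsLike {
        fn init(&mut self) {
            // hook up the engine-store server helper here
        }
    }

    impl MaybePageStorage for PsLike {
        fn as_ps(&mut self) -> Option<&mut PsLike> {
            Some(self)
        }
    }

    fn init_if_page_storage<E: MaybePageStorage>(engine: &mut E) {
        if let Some(ps) = engine.as_ps() {
            ps.init();
        }
    }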
diff --git a/proxy_tests/Cargo.toml b/proxy_tests/Cargo.toml index e9730c960c5..31e8d93a498 100644 --- a/proxy_tests/Cargo.toml +++ b/proxy_tests/Cargo.toml @@ -37,6 +37,8 @@ mem-profiling = ["tikv/mem-profiling"] sse = ["tikv/sse"] portable = ["tikv/portable"] +enable-pagestorage = ["engine_tiflash/enable-pagestorage", "engine_store_ffi/enable-pagestorage"] + [dependencies] api_version = { workspace = true } async-trait = "0.1" diff --git a/proxy_tests/proxy/config.rs b/proxy_tests/proxy/config.rs index d543f6b97ef..253c21d2eef 100644 --- a/proxy_tests/proxy/config.rs +++ b/proxy_tests/proxy/config.rs @@ -201,3 +201,26 @@ apply-low-priority-pool-size = 41 config.raft_store.apply_batch_system.low_priority_pool_size ); } + +#[test] +fn test_config_proxy_owned_config() { + test_util::init_log_for_test(); + let mut file = tempfile::NamedTempFile::new().unwrap(); + write!( + file, + " +[engine-store] +enable-fast-add-peer = true + " + ) + .unwrap(); + let path = file.path(); + + let mut v: Vec = vec![]; + let cpath = Some(path.as_os_str()); + let proxy_config = gen_proxy_config(&cpath, false, &mut v); + + info!("using proxy config"; "config" => ?proxy_config); + info!("!!!!! using proxy config 2"; "engine_store" => ?proxy_config.engine_store); + assert_eq!(true, proxy_config.engine_store.enable_fast_add_peer); +} diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs new file mode 100644 index 00000000000..c96ce750c3e --- /dev/null +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -0,0 +1,654 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use crate::proxy::*; + +#[derive(PartialEq, Eq)] +enum SourceType { + Leader, + Learner, + DelayedLearner, + InvalidSource, +} + +#[derive(PartialEq, Eq, Debug)] +enum PauseType { + None, + Build, + ApplySnapshot, + SendFakeSnapshot, +} + +#[test] +fn basic_fast_add_peer() { + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("fast_add_peer_fake_snapshot", "return(1)").unwrap(); + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k0", b"v0"); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + + cluster.shutdown(); + fail::remove("fallback_to_slow_path_not_allow"); + fail::remove("fast_add_peer_fake_snapshot"); + fail::remove("before_tiflash_check_double_write"); +} + +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseType) { + // The case in TiFlash is (DelayedPeer, false, Build) + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + if block_wait { + fail::cfg("ffi_fast_add_peer_block_wait", "return(1)").unwrap(); + } + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. 
+ pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + // If we don't write here, we will have the first MsgAppend with (6,6), which + // will cause "fast-forwarded commit to snapshot". + cluster.must_put(b"k0", b"v0"); + + // Add learner 2 from leader 1 + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + // std::thread::sleep(std::time::Duration::from_millis(2000)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + + // Getting (k1,v1) not necessarily means peer 2 is ready. + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + // Add learner 3 according to source_type + match source_type { + SourceType::Learner | SourceType::DelayedLearner => { + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + } + SourceType::InvalidSource => { + fail::cfg("ffi_fast_add_peer_from_id", "return(100)").unwrap(); + } + _ => (), + }; + + match pause { + PauseType::Build => fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(), + PauseType::ApplySnapshot => fail::cfg("on_can_apply_snapshot", "return(false)").unwrap(), + PauseType::SendFakeSnapshot => { + fail::cfg("fast_add_peer_fake_send", "return(1)").unwrap(); + // If we fake send snapshot, then fast path will certainly fail. + // Then we will timeout in FALLBACK_MILLIS and go to slow path. + } + _ => (), + } + + // Add peer 3 + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + cluster.must_put(b"k2", b"v2"); + + let need_fallback = if pause == PauseType::SendFakeSnapshot { + true + } else { + false + }; + + // If we need to fallback to slow path, + // we must make sure the data is persisted before Leader generated snapshot. + // This is necessary, since we haven't adapt `handle_snapshot`, + // which is a leader logic. + if need_fallback { + check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1])); + iter_ffi_helpers( + &cluster, + Some(vec![1]), + &mut |_, _, ffi: &mut FFIHelperSet| unsafe { + let server = ffi.engine_store_server.as_mut(); + server.write_to_db_by_region_id(1, "persist for up-to-date snapshot".to_string()); + }, + ); + } + + match source_type { + SourceType::DelayedLearner => { + // Make sure conf change is applied in peer 2. + check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Recv), + )); + cluster.must_put(b"k3", b"v3"); + } + _ => (), + }; + + // Wait some time and then recover. + match pause { + PauseType::Build => { + std::thread::sleep(std::time::Duration::from_millis(3000)); + fail::remove("ffi_fast_add_peer_pause"); + } + PauseType::ApplySnapshot => { + std::thread::sleep(std::time::Duration::from_millis(4000)); + fail::remove("on_can_apply_snapshot"); + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(5000)); + } + PauseType::SendFakeSnapshot => { + // Wait FALLBACK_MILLIS + std::thread::sleep(std::time::Duration::from_millis(5000)); + fail::remove("fast_add_peer_fake_send"); + std::thread::sleep(std::time::Duration::from_millis(2000)); + } + _ => (), + } + + // Check stage 1. 
+ match source_type { + SourceType::DelayedLearner => { + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + } + SourceType::Learner => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + _ => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + }; + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); + + match pause { + PauseType::ApplySnapshot => { + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, _ffi: &mut FFIHelperSet| { + // Not actually the case, since we allow handling + // MsgAppend multiple times. + // So the following fires when: + // (DelayedLearner, false, ApplySnapshot) + + // let server = &ffi.engine_store_server; + // (*ffi.engine_store_server).mutate_region_states(1, |e: + // &mut RegionStats| { assert_eq!(1, + // e.fast_add_peer_count.load(Ordering::SeqCst)); + // }); + }, + ); + } + _ => (), + } + + match source_type { + SourceType::DelayedLearner => { + cluster.clear_send_filters(); + } + _ => (), + }; + + // Destroy peer, and then try re-add a new peer of the same region. + pd_client.must_remove_peer(1, new_learner_peer(3, 3)); + must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_none() + }); + std::thread::sleep(std::time::Duration::from_millis(1000)); + // Assert the peer removing succeeed. + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + assert!(!server.kvstore.contains_key(&1)); + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + e.fast_add_peer_count.store(0, Ordering::SeqCst); + }); + }, + ); + cluster.must_put(b"k5", b"v5"); + // These failpoints make sure we will cause again a fast path. + if source_type == SourceType::InvalidSource { + // If we still use InvalidSource, we still need to goto slow path. + } else { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + } + // Re-add peer in store. + pd_client.must_add_peer(1, new_learner_peer(3, 4)); + // Wait until Learner has applied ConfChange + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 4).is_some() + }); + // If we re-add peer, we can still go fast path. 
+ iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert!(e.fast_add_peer_count.load(Ordering::SeqCst) > 0); + }); + }, + ); + cluster.must_put(b"k6", b"v6"); + check_key( + &cluster, + b"k6", + b"v6", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + fail::remove("fallback_to_slow_path_not_allow"); + fail::remove("fast_path_is_not_first"); + + fail::remove("on_can_apply_snapshot"); + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("on_pre_persist_with_finish"); + fail::remove("ffi_fast_add_peer_block_wait"); + cluster.shutdown(); +} + +#[test] +fn test_fast_add_peer_from_leader() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Leader, false, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +/// Fast path by learner snapshot. +#[test] +fn test_fast_add_peer_from_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, false, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +/// If a learner is delayed, but already applied ConfChange. +#[test] +fn test_fast_add_peer_from_delayed_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, false, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +/// If we select a wrong source, or we can't run fast path, we can fallback to +/// normal. +#[test] +fn test_fast_add_peer_from_invalid_source() { + simple_fast_add_peer(SourceType::InvalidSource, false, PauseType::None); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +// Delay when fetch and build data +#[test] +fn test_fast_add_peer_from_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, PauseType::Build); + fail::remove("apply_on_handle_snapshot_sync"); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::Build); + fail::remove("apply_on_handle_snapshot_sync"); + fail::remove("fallback_to_slow_path_not_allow"); +} + +// Delay when applying snapshot +// This test is origianlly aimed to test multiple MsgSnapshot. +// However, we observed less repeated MsgAppend than in real cluster. 
+#[test] +fn test_fast_add_peer_from_learner_blocked_paused_apply() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, PauseType::ApplySnapshot); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused_apply() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::ApplySnapshot); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_apply() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, false, PauseType::ApplySnapshot); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_timeout_fallback() { + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); + simple_fast_add_peer(SourceType::Learner, false, PauseType::SendFakeSnapshot); + fail::remove("on_pre_persist_with_finish"); + fail::remove("apply_on_handle_snapshot_sync"); +} + +#[test] +fn test_existing_peer() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 3, 4, Some(true), None, None); + fail::remove("fallback_to_slow_path_not_allow"); + + stop_tiflash_node(&mut cluster, 2); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + restart_tiflash_node(&mut cluster, 2); + must_put_and_check_key(&mut cluster, 5, 6, Some(true), None, None); + + cluster.shutdown(); + fail::remove("go_fast_path_not_allow"); + fail::remove("before_tiflash_check_double_write"); +} + +// We will reject remote peer in Applying state. +#[test] +fn test_apply_snapshot() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + // We add peer 3 from peer 2, it will be paused before fetching peer 2's data. + // However, peer 2 will apply conf change. 
+ fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_put_and_check_key(&mut cluster, 2, 3, Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); + + // peer 2 can't apply new kvs. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Both), + )); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Both), + )); + cluster.must_put(b"k3", b"v3"); + cluster.must_put(b"k4", b"v4"); + cluster.must_put(b"k5", b"v5"); + // Log compacted, peer 2 will get snapshot, however, we pause when applying + // snapshot. + force_compact_log(&mut cluster, b"k2", Some(vec![1])); + // Wait log compacted. + std::thread::sleep(std::time::Duration::from_millis(1000)); + fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); + // Trigger a snapshot to 2. + cluster.clear_send_filters(); + + debug!("wait applying snapshot of peer 2"); + // Wait until peer 2 in Applying state. + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + states.in_disk_region_state.get_state() == PeerState::Applying + }); + + // Now if we continue fast path, peer 2 will be in Applying state. + // Peer 3 can't use peer 2's data. + // We will end up going slow path. + fail::remove("ffi_fast_add_peer_pause"); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(300)); + // Resume applying snapshot + fail::remove("on_ob_post_apply_snapshot"); + check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![1, 3])); + cluster.shutdown(); + fail::remove("go_fast_path_not_allow"); + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("before_tiflash_check_double_write"); +} + +#[test] +fn test_split_no_fast_add() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + let _ = cluster.run(); + + // Compose split keys + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + check_key(&cluster, b"k1", b"v1", Some(true), None, None); + check_key(&cluster, b"k3", b"v3", Some(true), None, None); + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + cluster.must_split(&r1, b"k2"); + must_wait_until_cond_node(&cluster, 1000, None, &|states: &States| -> bool { + states.in_disk_region_state.get_region().get_peers().len() == 3 + }); + let _r1_new = cluster.get_region(b"k1"); // 1000 + let _r3_new = cluster.get_region(b"k3"); // 1 + cluster.must_put(b"k0", b"v0"); + check_key(&cluster, b"k0", b"v0", Some(true), None, None); + + fail::remove("go_fast_path_not_allow"); + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); +} + +#[test] +fn test_split_merge() { + let (mut cluster, pd_client) = 
new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + let _ = cluster.run_conf_change(); + + // Compose split keys + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1])); + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + cluster.must_split(&r1, b"k2"); + let r1_new = cluster.get_region(b"k1"); // 1000 + let r3_new = cluster.get_region(b"k3"); // 1 + let r1_id = r1_new.get_id(); + let r3_id = r3_new.get_id(); + debug!("r1_new {} r3_new {}", r1_id, r3_id); + + // Test add peer after split + pd_client.must_add_peer(r1_id, new_learner_peer(2, 2001)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![2])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + pd_client.must_add_peer(r3_id, new_learner_peer(2, 2003)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(false), None, Some(vec![2])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![2])); + + // Test merge + pd_client.must_add_peer(r3_id, new_learner_peer(3, 3003)); + pd_client.merge_region(r1_id, r3_id); + must_not_merged(pd_client.clone(), r1_id, Duration::from_millis(1000)); + pd_client.must_add_peer(r1_id, new_learner_peer(3, 3001)); + pd_client.must_merge(r1_id, r3_id); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![3])); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![3])); + + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); +} + +#[test] +fn test_fall_back_to_slow_path() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 2); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + cluster.must_put(b"k2", b"v2"); + + fail::cfg("ffi_fast_add_peer_fail_after_write", "return(1)").unwrap(); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + fail::remove("ffi_fast_add_peer_fail_after_write"); + fail::remove("on_can_apply_snapshot"); + fail::remove("on_pre_persist_with_finish"); + fail::remove("go_fast_path_not_allow"); + cluster.shutdown(); +} + +#[test] +fn test_single_replica_migrate() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply 
snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + + // Fast add peer 2 + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + + // Remove peer 2. + pd_client.must_remove_peer(1, new_learner_peer(2, 2)); + must_wait_until_cond_generic( + &cluster, + 1, + None, + &|states: &HashMap| -> bool { states.get(&2).is_none() }, + ); + + // Remove peer 2 and then add some new logs. + cluster.must_put(b"krm2", b"v"); + check_key(&cluster, b"krm2", b"v", Some(true), None, Some(vec![1])); + + // Try fast add peer from removed peer 2. + // TODO It will fallback to slow path if we don't support single replica + // migration. + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + check_key(&cluster, b"krm2", b"v", Some(true), None, Some(vec![3])); + std::thread::sleep(std::time::Duration::from_millis(2000)); + must_wait_until_cond_generic( + &cluster, + 1, + None, + &|states: &HashMap| -> bool { states.get(&3).is_some() }, + ); + fail::remove("go_fast_path_not_allow"); + + fail::remove("on_can_apply_snapshot"); + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); +} diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs new file mode 100644 index 00000000000..9694d76529e --- /dev/null +++ b/proxy_tests/proxy/ffi.rs @@ -0,0 +1,99 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_store_ffi::{ + get_engine_store_server_helper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawVoidPtr, + UnwrapExternCFunc, +}; +use new_mock_engine_store::{ + mock_cluster::init_global_ffi_helper_set, mock_store::RawCppPtrTypeImpl, +}; + +#[test] +fn test_tuple_of_raw_cpp_ptr() { + tikv_util::set_panic_hook(true, "./"); + unsafe { + init_global_ffi_helper_set(); + let helper = get_engine_store_server_helper(); + + let len = 10; + let mut v: Vec = vec![]; + + for i in 0..len { + let s = format!("s{}", i); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + v.push(raw_cpp_ptr); + } + + let (ptr_v, l, cap) = v.into_raw_parts(); + for i in l..cap { + let v = ptr_v.add(i); + (*v).ptr = std::ptr::null_mut(); + (*v).type_ = RawCppPtrTypeImpl::None.into(); + } + assert_ne!(l, cap); + let cpp_ptr_tp = RawCppPtrTuple { + inner: ptr_v, + len: cap as u64, + }; + drop(cpp_ptr_tp); + } +} + +#[test] +fn test_array_of_raw_cpp_ptr() { + tikv_util::set_panic_hook(true, "./"); + unsafe { + init_global_ffi_helper_set(); + let helper = get_engine_store_server_helper(); + + let len = 10; + let mut v: Vec = vec![]; + + for i in 0..len { + let s = format!("s{}", i); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + let raw_void_ptr = raw_cpp_ptr.into_raw(); + v.push(raw_void_ptr); + } + + let (ptr_v, l, cap) = v.into_raw_parts(); + for i in l..cap { + let v = ptr_v.add(i); + *v = std::ptr::null_mut(); + } + assert_ne!(l, cap); + let cpp_ptr_arr = RawCppPtrArr { + inner: ptr_v, + type_: RawCppPtrTypeImpl::String.into(), + len: cap as u64, + }; + drop(cpp_ptr_arr); + } +} + +#[test] +fn test_carray_of_raw_cpp_ptr() { + tikv_util::set_panic_hook(true, "./"); + unsafe { + init_global_ffi_helper_set(); + let helper = get_engine_store_server_helper(); + + const LEN: usize = 10; + let mut v: [RawVoidPtr; LEN] = [std::ptr::null_mut(); LEN]; + + for i in 0..LEN { + let i = i as usize; + let s = format!("s{}", i); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + let raw_void_ptr = raw_cpp_ptr.into_raw(); + v[i] = raw_void_ptr; + } + + let pv1 = Box::into_raw(Box::new(v)); + (helper.fn_gc_raw_cpp_ptr_carr.into_inner())( + pv1 as RawVoidPtr, + RawCppPtrTypeImpl::String.into(), + LEN as u64, + ); + } +} diff --git a/proxy_tests/proxy/mod.rs b/proxy_tests/proxy/mod.rs index c2d2336999d..c464315afdc 100644 --- a/proxy_tests/proxy/mod.rs +++ b/proxy_tests/proxy/mod.rs @@ -3,11 +3,15 @@ #![feature(custom_test_frameworks)] #![test_runner(test_util::run_failpoint_tests)] #![recursion_limit = "100"] +#![feature(vec_into_raw_parts)] +#![feature(slice_pattern)] #[macro_use] extern crate slog_global; mod config; +mod fast_add_peer; +mod ffi; mod flashback; mod normal; mod proxy; diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 43891c2310d..bb09dfaee61 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -26,7 +26,7 @@ pub use kvproto::{ }; pub use new_mock_engine_store::{ config::Config, - get_apply_state, get_raft_local_state, get_region_local_state, make_new_region, + general_get_apply_state, general_get_region_local_state, get_raft_local_state, make_new_region, mock_cluster::{new_put_cmd, new_request, FFIHelperSet}, must_get_equal, must_get_none, node::NodeCluster, @@ -42,7 +42,7 @@ pub use test_raftstore::{new_learner_peer, new_peer}; pub use tikv_util::{ box_err, box_try, config::{ReadableDuration, ReadableSize}, - store::find_peer, + store::{find_peer, find_peer_by_id}, 
time::Duration, HandyRwLock, }; @@ -76,7 +76,7 @@ pub struct States { pub fn iter_ffi_helpers>( cluster: &Cluster, store_ids: Option<Vec<u64>>, - f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet) -> (), + f: &mut dyn FnMut(u64, &engine_store_ffi::TiFlashEngine, &mut FFIHelperSet) -> (), ) { cluster.iter_ffi_helpers(store_ids, f); } @@ -90,7 +90,7 @@ pub fn maybe_collect_states( iter_ffi_helpers( cluster, store_ids, - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, engine: &engine_store_ffi::TiFlashEngine, ffi: &mut FFIHelperSet| { let server = &ffi.engine_store_server; let raft_engine = &cluster.get_engines(id).raft; if let Some(region) = server.kvstore.get(&region_id) { @@ -98,8 +98,8 @@ pub fn maybe_collect_states( Ok(Some(i)) => i, _ => unreachable!(), }; - let apply_state = get_apply_state(&engine, region_id); - let region_state = get_region_local_state(&engine, region_id); + let apply_state = general_get_apply_state(engine, region_id); + let region_state = general_get_region_local_state(engine, region_id); let raft_state = get_raft_local_state(raft_engine, region_id); if apply_state.is_none() { return; @@ -134,6 +134,7 @@ pub fn collect_all_states(cluster: &Cluster, region_id: u64) -> Has } pub fn new_mock_cluster(id: u64, count: usize) -> (Cluster<NodeCluster>, Arc<TestPdClient>) { + tikv_util::set_panic_hook(true, "./"); let pd_client = Arc::new(TestPdClient::new(0, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(pd_client.clone()))); let mut cluster = Cluster::new(id, count, sim, pd_client.clone(), ProxyConfig::default()); @@ -192,7 +193,7 @@ pub fn must_get_mem( std::thread::sleep(std::time::Duration::from_millis(20)); } let s = std::str::from_utf8(key).unwrap_or(""); - panic!( + let e = format!( "can't get mem value {:?} for key {}({}) in store {} cf {:?}, actual {:?}", value.map(tikv_util::escape), log_wrappers::hex_encode_upper(key), @@ -200,7 +201,9 @@ pub fn must_get_mem( node_id, cf, last_res, - ) + ); + error!("{}", e); + panic!("{}", e); } pub fn must_put_and_check_key_with_generator (String, String)>( @@ -606,6 +609,9 @@ pub fn must_wait_until_cond_node( break; } } + } else { + // If the region does not exist in some store.
+ ok = false; } } if ok { @@ -619,6 +625,27 @@ pub fn must_wait_until_cond_node( } } +pub fn must_wait_until_cond_generic( + cluster: &Cluster, + region_id: u64, + store_ids: Option<Vec<u64>>, + pred: &dyn Fn(&HashMap<u64, States>) -> bool, +) -> HashMap<u64, States> { + let mut retry = 0; + loop { + let new_states = maybe_collect_states(&cluster, region_id, store_ids.clone()); + let ok = pred(&new_states); + if ok { + break new_states; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retry += 1; + if retry >= 30 { + panic!("states not as expected after timeout") + } + } +} + pub fn force_compact_log( cluster: &mut Cluster, key: &[u8], @@ -673,13 +700,13 @@ pub fn must_not_merged(pd_client: Arc, from: u64, duration: Durati let timer = tikv_util::time::Instant::now(); loop { let region = futures::executor::block_on(pd_client.get_region_by_id(from)).unwrap(); - if let Some(r) = region { + if let Some(_) = region { if timer.saturating_elapsed() > duration { return; } } else { panic!("region {} is merged.", from); } - std::thread::sleep_ms(10); + std::thread::sleep(std::time::Duration::from_millis(10)); } } diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 51362ef3917..7432c53ba15 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -145,8 +145,9 @@ fn test_get_region_local_state() { } /// This test is very important. -/// If make sure we can add learner peer for a store which is not started +/// It makes sure we can add a learner peer for a store which is not started /// actually. +/// We don't start the absent learner peer in this test. #[test] fn test_add_absent_learner_peer_by_simple() { let (mut cluster, pd_client) = new_mock_cluster(0, 3); @@ -181,8 +182,9 @@ fn test_add_absent_learner_peer_by_simple() { } /// This test is very important. -/// If make sure we can add learner peer for a store which is not started +/// It makes sure we can add a learner peer for a store which is not started /// actually. +/// We don't start the absent learner peer in this test. #[test] fn test_add_absent_learner_peer_by_joint() { let (mut cluster, pd_client) = new_mock_cluster(0, 3); @@ -317,6 +319,8 @@ fn later_bootstrap_learner_peer( } } +/// We start the absent learner peer in this test. +/// We don't try to reuse data from another learner peer. #[test] fn test_add_delayed_started_learner_by_joint() { let (mut cluster, pd_client) = new_later_add_learner_cluster( @@ -373,7 +377,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ iter_ffi_helpers( cluster, Some(vec![from]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |_, _, ffi: &mut FFIHelperSet| { let server = &mut ffi.engine_store_server; maybe_source_region = server.kvstore.get(&region_id).cloned(); }, @@ -386,7 +390,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ iter_ffi_helpers( cluster, Some(vec![to]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { let server = &mut ffi.engine_store_server; assert!(server.kvstore.get(&region_id).is_none()); @@ -429,6 +433,9 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ } } +/// We start the absent learner peer in this test. +/// We try to reuse data from another learner peer. +/// We don't use a snapshot to initialize a peer.
#[test] fn test_add_delayed_started_learner_no_snapshot() { // fail::cfg("before_tiflash_check_double_write", "return").unwrap(); @@ -471,6 +478,8 @@ fn test_add_delayed_started_learner_no_snapshot() { later_bootstrap_learner_peer(&mut cluster, vec![5], 1); // After that, we manually compose data, to avoid snapshot sending. recover_from_peer(&cluster, 4, 5, 1); + + cluster.must_put(b"m1", b"v1"); // Add node 5 to cluster. pd_client.must_add_peer(1, new_learner_peer(5, 5)); @@ -515,6 +524,9 @@ fn test_add_delayed_started_learner_no_snapshot() { // fail::remove("before_tiflash_do_write"); } +/// We start the absent learner peer in this test. +/// We try to reuse data from other learner peer. +/// We use a snapshot to initialize a peer. #[test] fn test_add_delayed_started_learner_snapshot() { let (mut cluster, pd_client) = new_later_add_learner_cluster( @@ -603,7 +615,7 @@ fn test_add_delayed_started_learner_snapshot() { iter_ffi_helpers( &cluster, Some(vec![5]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { assert_eq!(e.pre_handle_count.load(Ordering::SeqCst), 1); }); diff --git a/proxy_tests/proxy/snapshot.rs b/proxy_tests/proxy/snapshot.rs index 69211e2bdfd..628fb06811d 100644 --- a/proxy_tests/proxy/snapshot.rs +++ b/proxy_tests/proxy/snapshot.rs @@ -292,6 +292,7 @@ fn test_prehandle_fail() { #[test] fn test_split_merge() { let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); // Can always apply snapshot immediately @@ -331,7 +332,7 @@ fn test_split_merge() { assert_eq!(server.kvstore.get(&r1_new.get_id()).unwrap().region, r1_new); assert_eq!(server.kvstore.get(&r3_new.get_id()).unwrap().region, r3_new); - // Can get from disk + // Can get from disk, note in old version, we don't support migrate memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not @@ -358,7 +359,7 @@ fn test_split_merge() { r3_new2 ); - // Can get from disk + // Can get from disk, note in old version, we don't support migrate memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not delete data diff --git a/raftstore-proxy/Cargo.toml b/raftstore-proxy/Cargo.toml index 5ab8af974a7..074c9c49c80 100644 --- a/raftstore-proxy/Cargo.toml +++ b/raftstore-proxy/Cargo.toml @@ -14,6 +14,7 @@ portable = ["proxy_server/portable"] sse = ["proxy_server/sse"] mem-profiling = ["proxy_server/mem-profiling"] failpoints = ["proxy_server/failpoints"] +enable-pagestorage = ["proxy_server/enable-pagestorage"] cloud-aws = ["proxy_server/cloud-aws"] cloud-gcp = ["proxy_server/cloud-gcp"] diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 519af996bc4..9705433b49c 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 15776819379826780689ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 17394545035928865111ull; } \ No 
newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 49b82c3704c..967508ce0ed 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -6,6 +6,12 @@ namespace DB { struct EngineStoreServerWrap; +enum class SpecialCppPtrType : uint32_t { + None = 0, + TupleOfRawCppPtr = 1, + ArrayOfRawCppPtr = 2, +}; + enum class EngineStoreApplyRes : uint32_t { None = 0, Persist, @@ -86,6 +92,34 @@ struct CppStrWithView { BaseBuffView view; }; +struct PageAndCppStrWithView { + RawCppPtr page; + RawCppPtr key; + BaseBuffView page_view; + BaseBuffView key_view; +}; + +struct RawCppPtrCarr { + RawVoidPtr inner; + const uint64_t len; + RawCppPtrType type; +}; + +// An tuple of pointers, like `void **`, +// Can be used to represent structures. +struct RawCppPtrTuple { + RawCppPtr *inner; + const uint64_t len; +}; + +// An array of pointers(same type), like `T **`, +// Can be used to represent arrays. +struct RawCppPtrArr { + RawVoidPtr *inner; + const uint64_t len; + RawCppPtrType type; +}; + enum class HttpRequestStatus : uint8_t { Ok = 0, ErrorParam, @@ -143,6 +177,21 @@ enum class KVGetStatus : uint32_t { NotFound, }; +enum class FastAddPeerStatus : uint32_t { + Ok = 0, + WaitForData, + OtherError, + NoSuitable, + BadData, + FailedInject, +}; + +struct FastAddPeerRes { + FastAddPeerStatus status; + CppStrWithView apply_state; + CppStrWithView region; +}; + struct RaftStoreProxyFFIHelper { RaftStoreProxyPtr proxy_ptr; RaftProxyStatus (*fn_handle_get_proxy_status)(RaftStoreProxyPtr); @@ -190,6 +239,22 @@ struct EngineStoreServerHelper { uint8_t (*fn_need_flush_data)(EngineStoreServerWrap *, uint64_t); uint8_t (*fn_try_flush_data)(EngineStoreServerWrap *, uint64_t, uint8_t, uint64_t, uint64_t); + RawCppPtr (*fn_create_write_batch)(const EngineStoreServerWrap *); + void (*fn_write_batch_put_page)(RawVoidPtr, BaseBuffView, BaseBuffView); + void (*fn_write_batch_del_page)(RawVoidPtr, BaseBuffView); + uint64_t (*fn_write_batch_size)(RawVoidPtr); + uint8_t (*fn_write_batch_is_empty)(RawVoidPtr); + void (*fn_write_batch_merge)(RawVoidPtr, RawVoidPtr); + void (*fn_write_batch_clear)(RawVoidPtr); + void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); + CppStrWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, + BaseBuffView); + RawCppPtrCarr (*fn_handle_scan_page)(const EngineStoreServerWrap *, + BaseBuffView, BaseBuffView); + void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); + CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, + BaseBuffView); + uint8_t (*fn_ps_is_empty)(const EngineStoreServerWrap *); void (*fn_atomic_update_proxy)(EngineStoreServerWrap *, RaftStoreProxyFFIHelper *); void (*fn_handle_destroy)(EngineStoreServerWrap *, uint64_t); @@ -208,6 +273,8 @@ struct EngineStoreServerHelper { BaseBuffView body); uint8_t (*fn_check_http_uri_available)(BaseBuffView); void (*fn_gc_raw_cpp_ptr)(RawVoidPtr, RawCppPtrType); + void (*fn_gc_raw_cpp_ptr_carr)(RawVoidPtr, RawCppPtrType, uint64_t); + void (*fn_gc_special_raw_cpp_ptr)(RawVoidPtr, uint64_t, SpecialCppPtrType); CppStrWithView (*fn_get_config)(EngineStoreServerWrap *, uint8_t full); void (*fn_set_store)(EngineStoreServerWrap *, BaseBuffView); void (*fn_set_pb_msg_by_bytes)(MsgPBType type, RawVoidPtr ptr, @@ -215,5 +282,7 @@ struct EngineStoreServerHelper { void (*fn_handle_safe_ts_update)(EngineStoreServerWrap *, uint64_t 
region_id, uint64_t self_safe_ts, uint64_t leader_safe_ts); + FastAddPeerRes (*fn_fast_add_peer)(EngineStoreServerWrap *, + uint64_t region_id, uint64_t new_peer_id); }; } // namespace DB diff --git a/scripts/check-docker-build b/scripts/check-docker-build index 6a505f31a89..0eee0c5cf1f 100755 --- a/scripts/check-docker-build +++ b/scripts/check-docker-build @@ -2,7 +2,7 @@ # This script checks if all cargo targets have path specifications. set -euo pipefail -for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/\|./profiler/'); do +for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/'); do for target in "test" "bench" "bin" "example"; do # from "[[test]]" to the first trailing empty line matches=$(sed -n "/\[\[$target\]\]/,/^$/ p" $i) diff --git a/src/config/mod.rs b/src/config/mod.rs index 6ed8da3f111..38d69f1ab29 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -55,6 +55,7 @@ use raftstore::{ coprocessor::{Config as CopConfig, RegionInfoAccessor}, store::{CompactionGuardGeneratorFactory, Config as RaftstoreConfig, SplitConfig}, }; +use resource_control::Config as ResourceControlConfig; use resource_metering::Config as ResourceMeteringConfig; use security::SecurityConfig; use serde::{ @@ -115,7 +116,8 @@ fn bloom_filter_ratio(et: EngineType) -> f64 { EngineType::RaftKv => 0.1, // In v2, every peer has its own tablet. The data scale is about tens of // GiBs. We only need a small portion for those key. - EngineType::RaftKv2 => 0.005, + // TODO: disable it for now until find out the proper ratio + EngineType::RaftKv2 => 0.0, } } @@ -344,7 +346,7 @@ macro_rules! cf_config { #[online_config(skip)] pub enable_doubly_skiplist: bool, #[online_config(skip)] - pub enable_compaction_guard: bool, + pub enable_compaction_guard: Option, #[online_config(skip)] pub compaction_guard_min_output_file_size: ReadableSize, #[online_config(skip)] @@ -596,7 +598,7 @@ macro_rules! 
build_cf_opt { if $opt.enable_doubly_skiplist { cf_opts.set_doubly_skiplist(); } - if $opt.enable_compaction_guard { + if $opt.enable_compaction_guard.unwrap_or(false) { if let Some(provider) = $region_info_provider { let factory = CompactionGuardGeneratorFactory::new( $cf_name, @@ -629,7 +631,7 @@ impl Default for DefaultCfConfig { let total_mem = SysQuota::memory_limit_in_bytes(); DefaultCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_DEFAULT, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, @@ -671,7 +673,7 @@ impl Default for DefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -754,7 +756,7 @@ impl Default for WriteCfConfig { }; WriteCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_WRITE, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, @@ -796,7 +798,7 @@ impl Default for WriteCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -902,7 +904,7 @@ impl Default for LockCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -985,7 +987,7 @@ impl Default for RaftCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -1218,6 +1220,8 @@ impl DbConfig { match engine { EngineType::RaftKv => { self.allow_concurrent_memtable_write.get_or_insert(true); + self.defaultcf.enable_compaction_guard.get_or_insert(true); + self.writecf.enable_compaction_guard.get_or_insert(true); } EngineType::RaftKv2 => { self.enable_multi_batch_write.get_or_insert(false); @@ -1226,6 +1230,10 @@ impl DbConfig { self.write_buffer_limit.get_or_insert(ReadableSize( (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, )); + self.defaultcf.disable_write_stall = true; + self.writecf.disable_write_stall = true; + self.lockcf.disable_write_stall = true; + self.raftcf.disable_write_stall = true; } } } @@ -1256,7 +1264,7 @@ impl DbConfig { } } - pub fn build_opt(&self, shared: &DbResources) -> RocksDbOptions { + pub fn build_opt(&self, shared: &DbResources, for_engine: EngineType) -> RocksDbOptions { let mut opts 
= RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1298,7 +1306,9 @@ impl DbConfig { if let Some(b) = self.paranoid_checks { opts.set_paranoid_checks(b); } - opts.set_info_log(RocksdbLogger::default()); + if for_engine == EngineType::RaftKv { + opts.set_info_log(RocksdbLogger::default()); + } opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); @@ -1388,7 +1398,7 @@ impl DbConfig { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as i32 * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as i32; if self.max_background_jobs <= 0 || self.max_background_jobs > limit { return Err(format!( "max_background_jobs should be greater than 0 and less than or equal to {:?}", @@ -1475,7 +1485,7 @@ impl Default for RaftDefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -2645,7 +2655,7 @@ pub struct CdcConfig { impl Default for CdcConfig { fn default() -> Self { Self { - min_ts_interval: ReadableDuration::millis(200), + min_ts_interval: ReadableDuration::secs(1), hibernate_regions_compatible: true, // 4 threads for incremental scan. incremental_scan_threads: 4, @@ -3037,6 +3047,9 @@ pub struct TikvConfig { #[online_config(skip)] pub causal_ts: CausalTsConfig, + + #[online_config(submodule)] + pub resource_control: ResourceControlConfig, } impl Default for TikvConfig { @@ -3079,6 +3092,7 @@ impl Default for TikvConfig { resource_metering: ResourceMeteringConfig::default(), backup_stream: BackupStreamConfig::default(), causal_ts: CausalTsConfig::default(), + resource_control: ResourceControlConfig::default(), } } } @@ -3122,6 +3136,9 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); + if !self.raft_engine.enable { + panic!("partitioned-raft-kv only supports raft log engine."); + } } self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; @@ -4409,9 +4426,10 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); + let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); tikv_cfg .rocksdb - .build_opt(&tikv_cfg.rocksdb.build_resources(Arc::new(Env::default()))); + .build_opt(&resource, tikv_cfg.storage.engine); } #[test] @@ -4572,12 +4590,10 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); + let resource = cfg.rocksdb.build_resources(Arc::default()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some( - cfg.rocksdb - .build_opt(&cfg.rocksdb.build_resources(Arc::new(Env::default()))), - ), + Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), cfg.rocksdb.build_cf_opts( &cfg.rocksdb .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), @@ -4915,14 +4931,8 @@ mod tests { let max_pool_size = std::cmp::max(4, SysQuota::cpu_cores_quota() as usize); let check_scale_pool_size = |size: 
usize, ok: bool| { - let origin_pool_size = scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(); - let origin_pool_size_high = scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(); + let origin_pool_size = scheduler.get_sched_pool().get_pool_size(CommandPri::Normal); + let origin_pool_size_high = scheduler.get_sched_pool().get_pool_size(CommandPri::High); let res = cfg_controller .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)); let (expected_size, expected_size_high) = if ok { @@ -4933,17 +4943,11 @@ mod tests { (origin_pool_size, origin_pool_size_high) }; assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), expected_size ); assert_eq!( - scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::High), expected_size_high ); }; @@ -5200,7 +5204,7 @@ mod tests { // Test comopaction guard disabled. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: false, + enable_compaction_guard: Some(false), ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); @@ -5213,7 +5217,7 @@ mod tests { // Test compaction guard enabled but region info provider is missing. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), ..Default::default() }; let provider: Option = None; @@ -5226,7 +5230,7 @@ mod tests { // Test compaction guard enabled. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(4), compaction_guard_max_output_file_size: ReadableSize::mb(64), ..Default::default() @@ -5538,22 +5542,27 @@ mod tests { cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. 
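With `enable_compaction_guard` now an `Option`, an unset value means "use the engine-specific default": the `optimize_for`/`build_cf_opt` changes above fill in `true` for RaftKv's default and write CFs and fall back to disabled when nothing was filled in, while an explicit user value always wins. A minimal sketch of that tri-state pattern, with illustrative names rather than the real TiKV config structs:

    // Illustrative sketch of the Option-based default, not the real config types.
    #[derive(Default)]
    struct CfConfig {
        enable_compaction_guard: Option<bool>,
    }

    #[allow(dead_code)]
    enum EngineType {
        RaftKv,
        RaftKv2,
    }

    impl CfConfig {
        // Called once the engine type is known: an explicit user value wins,
        // otherwise fill in the engine-specific default.
        fn optimize_for(&mut self, engine: EngineType) {
            match engine {
                EngineType::RaftKv => {
                    self.enable_compaction_guard.get_or_insert(true);
                }
                EngineType::RaftKv2 => {} // leave unset; treated as disabled below
            }
        }

        fn build(&self) -> bool {
            // Unset still means "disabled" when no default was filled in.
            self.enable_compaction_guard.unwrap_or(false)
        }
    }

    fn main() {
        let mut cfg = CfConfig::default();
        cfg.optimize_for(EngineType::RaftKv);
        assert!(cfg.build());

        let mut cfg2 = CfConfig { enable_compaction_guard: Some(false) };
        cfg2.optimize_for(EngineType::RaftKv);
        assert!(!cfg2.build()); // the explicit user value is preserved
    }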
cfg.rocksdb.write_buffer_limit = None; + cfg.rocksdb.defaultcf.enable_compaction_guard = None; cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.enable_compaction_guard = None; cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; cfg.rocksdb.writecf.level0_stop_writes_trigger = None; cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.enable_compaction_guard = None; cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.enable_compaction_guard = None; cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; + cfg.raftdb.defaultcf.enable_compaction_guard = None; cfg.raftdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; diff --git a/src/coprocessor/checksum.rs b/src/coprocessor/checksum.rs index 52bd0a60184..3778f549427 100644 --- a/src/coprocessor/checksum.rs +++ b/src/coprocessor/checksum.rs @@ -1,5 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +use api_version::{keyspace::KvPair, ApiV1}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ // `ChecksumContext` is used to handle `ChecksumRequest` pub struct ChecksumContext { req: ChecksumRequest, - scanner: RangesScanner>>, + scanner: RangesScanner>, ApiV1>, } impl ChecksumContext { @@ -73,12 +74,13 @@ impl RequestHandler for ChecksumContext { let mut prefix_digest = crc64fast::Digest::new(); prefix_digest.write(&old_prefix); - while let Some((k, v)) = self.scanner.next().await? { + while let Some(row) = self.scanner.next().await? 
{ + let (k, v) = row.kv(); if !k.starts_with(&new_prefix) { return Err(box_err!("Wrong prefix expect: {:?}", new_prefix)); } checksum = - checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], &v); + checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], v); total_kvs += 1; total_bytes += k.len() + v.len() + old_prefix.len() - new_prefix.len(); } diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index ce575859e59..31a6df181d5 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -2,8 +2,9 @@ mod storage_impl; -use std::sync::Arc; +use std::{marker::PhantomData, sync::Arc}; +use api_version::KvFormat; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ tikv_util::quota_limiter::QuotaLimiter, }; -pub struct DagHandlerBuilder { +pub struct DagHandlerBuilder { req: DagRequest, ranges: Vec, store: S, @@ -29,9 +30,10 @@ pub struct DagHandlerBuilder { is_cache_enabled: bool, paging_size: Option, quota_limiter: Arc, + _phantom: PhantomData, } -impl DagHandlerBuilder { +impl DagHandlerBuilder { pub fn new( req: DagRequest, ranges: Vec, @@ -54,6 +56,7 @@ impl DagHandlerBuilder { is_cache_enabled, paging_size, quota_limiter, + _phantom: PhantomData, } } @@ -65,7 +68,7 @@ impl DagHandlerBuilder { pub fn build(self) -> Result> { COPR_DAG_REQ_COUNT.with_label_values(&["batch"]).inc(); - Ok(BatchDagHandler::new( + Ok(BatchDagHandler::new::<_, F>( self.req, self.ranges, self.store, @@ -87,7 +90,7 @@ pub struct BatchDagHandler { } impl BatchDagHandler { - pub fn new( + pub fn new( req: DagRequest, ranges: Vec, store: S, @@ -100,7 +103,7 @@ impl BatchDagHandler { quota_limiter: Arc, ) -> Result { Ok(Self { - runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request( + runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request::<_, F>( req, ranges, TikvStorage::new(store, is_cache_enabled), diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 54fcaeb0489..6ac1bebc541 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -7,6 +7,7 @@ use std::{ use ::tracker::{ set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, }; +use api_version::{dispatch_api_version, KvFormat}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; @@ -147,6 +148,21 @@ impl Endpoint { /// /// It also checks if there are locks in memory blocking this read request. fn parse_request_and_check_memory_locks( + &self, + req: coppb::Request, + peer: Option, + is_streaming: bool, + ) -> Result<(RequestHandlerBuilder, ReqContext)> { + dispatch_api_version!(req.get_context().get_api_version(), { + self.parse_request_and_check_memory_locks_impl::(req, peer, is_streaming) + }) + } + + /// Parse the raw `Request` to create `RequestHandlerBuilder` and + /// `ReqContext`. Returns `Err` if fails. + /// + /// It also checks if there are locks in memory blocking this read request. 
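The endpoint now resolves the request's API version once at the gRPC boundary and dispatches into a monomorphized `F: KvFormat` implementation. A rough stand-in for the shape of that dispatch; the trait and function bodies here are simplified and are not the real `dispatch_api_version!` expansion:

    // Pick a concrete KvFormat at runtime, then run a generic impl with it.
    trait KvFormat {
        const NAME: &'static str;
    }
    struct ApiV1;
    struct ApiV2;
    impl KvFormat for ApiV1 { const NAME: &'static str = "V1"; }
    impl KvFormat for ApiV2 { const NAME: &'static str = "V2"; }

    fn parse(api_version: u8) -> String {
        match api_version {
            2 => parse_impl::<ApiV2>(),
            _ => parse_impl::<ApiV1>(),
        }
    }

    fn parse_impl<F: KvFormat>() -> String {
        // Downstream builders (DAG, analyze, checksum) all carry F as a type
        // parameter, so the key codec is resolved at compile time per branch.
        format!("parsed with api {}", F::NAME)
    }

    fn main() {
        assert_eq!(parse(2), "parsed with api V2");
    }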
+ fn parse_request_and_check_memory_locks_impl( &self, mut req: coppb::Request, peer: Option, @@ -232,7 +248,7 @@ impl Endpoint { 0 => None, i => Some(i), }; - dag::DagHandlerBuilder::new( + dag::DagHandlerBuilder::<_, F>::new( dag, req_ctx.ranges.clone(), store, @@ -281,7 +297,7 @@ impl Endpoint { let quota_limiter = self.quota_limiter.clone(); builder = Box::new(move |snap, req_ctx| { - statistics::analyze::AnalyzeContext::new( + statistics::analyze::AnalyzeContext::<_, F>::new( analyze, req_ctx.ranges.clone(), start_ts, @@ -470,6 +486,11 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); // box the tracker so that moving it is cheap. let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); @@ -480,6 +501,7 @@ impl Endpoint { .in_resource_metering_tag(resource_tag), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded); async move { res.await? } @@ -578,6 +600,8 @@ impl Endpoint { response.set_locked(lock_info); } response.set_other_error(resp.take_other_error()); + // keep the exec details already generated. + response.set_exec_details_v2(resp.take_exec_details_v2()); GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { tracker.write_scan_detail( response.mut_exec_details_v2().mut_scan_detail_v2(), @@ -690,6 +714,11 @@ impl Endpoint { ) -> Result>> { let (tx, rx) = mpsc::channel::>(self.stream_channel_size); let priority = req_ctx.context.get_priority(); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); let key_ranges = req_ctx .ranges .iter() @@ -712,6 +741,7 @@ impl Endpoint { }), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded)?; Ok(rx) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 383f6161a1b..25ecf95653d 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -1,7 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Reverse, collections::BinaryHeap, mem, sync::Arc}; +use std::{cmp::Reverse, collections::BinaryHeap, marker::PhantomData, mem, sync::Arc}; +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -41,16 +42,17 @@ const ANALYZE_VERSION_V1: i32 = 1; const ANALYZE_VERSION_V2: i32 = 2; // `AnalyzeContext` is used to handle `AnalyzeReq` -pub struct AnalyzeContext { +pub struct AnalyzeContext { req: AnalyzeReq, storage: Option>>, ranges: Vec, storage_stats: Statistics, quota_limiter: Arc, is_auto_analyze: bool, + _phantom: PhantomData, } -impl AnalyzeContext { +impl AnalyzeContext { pub fn new( req: AnalyzeReq, ranges: Vec, @@ -77,13 +79,14 @@ impl AnalyzeContext { storage_stats: Statistics::default(), quota_limiter, is_auto_analyze, + _phantom: PhantomData, }) } // handle_column is used to process `AnalyzeColumnsReq` // it would build a histogram for the primary key(if needed) and // collectors for each column value. 
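The `_phantom: PhantomData` fields added to `DagHandlerBuilder`, `AnalyzeContext`, and the sample builders exist only to anchor the otherwise-unused `F` type parameter. A small self-contained illustration of why the marker is needed, using illustrative types rather than the real handlers:

    use std::marker::PhantomData;

    // A type parameter that is only used through trait bounds (never stored
    // as a field) must be anchored with PhantomData, or the compiler rejects
    // it as an unused parameter.
    struct Handler<S, F> {
        store: S,
        _phantom: PhantomData<F>,
    }

    impl<S, F> Handler<S, F> {
        fn new(store: S) -> Self {
            Handler {
                store,
                _phantom: PhantomData,
            }
        }
    }

    fn main() {
        let h: Handler<Vec<u8>, u32> = Handler::new(vec![1, 2, 3]);
        assert_eq!(h.store.len(), 3);
    }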
- async fn handle_column(builder: &mut SampleBuilder) -> Result> { + async fn handle_column(builder: &mut SampleBuilder) -> Result> { let (col_res, _) = builder.collect_columns_stats().await?; let res_data = { @@ -93,7 +96,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { + async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { let (col_res, idx_res) = builder.collect_columns_stats().await?; let res_data = { @@ -109,7 +112,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { + async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { let sample_res = builder.collect_column_stats().await?; let res_data = { let res = sample_res.into_proto(); @@ -122,7 +125,7 @@ impl AnalyzeContext { // it would build a histogram and count-min sketch of index values. async fn handle_index( req: AnalyzeIndexReq, - scanner: &mut RangesScanner>>, + scanner: &mut RangesScanner>, F>, is_common_handle: bool, ) -> Result> { let mut hist = Histogram::new(req.get_bucket_size() as usize); @@ -142,8 +145,8 @@ impl AnalyzeContext { } else { ANALYZE_VERSION_V1 }; - while let Some((key, _)) = scanner.next().await? { - let mut key = &key[..]; + while let Some(row) = scanner.next().await? { + let mut key = row.key(); if is_common_handle { table::check_record_key(key)?; key = &key[table::PREFIX_LEN..]; @@ -209,14 +212,14 @@ impl AnalyzeContext { } #[async_trait] -impl RequestHandler for AnalyzeContext { +impl RequestHandler for AnalyzeContext { async fn handle_request(&mut self) -> Result> { let ret = match self.req.get_tp() { AnalyzeType::TypeIndex | AnalyzeType::TypeCommonHandle => { let req = self.req.take_idx_req(); let ranges = std::mem::take(&mut self.ranges); - table::check_table_ranges(&ranges)?; - let mut scanner = RangesScanner::new(RangesScannerOptions { + table::check_table_ranges::(&ranges)?; + let mut scanner = RangesScanner::<_, F>::new(RangesScannerOptions { storage: self.storage.take().unwrap(), ranges: ranges .into_iter() @@ -240,7 +243,7 @@ impl RequestHandler for AnalyzeContext { let col_req = self.req.take_col_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, None, storage, ranges)?; + let mut builder = SampleBuilder::<_, F>::new(col_req, None, storage, ranges)?; let res = AnalyzeContext::handle_column(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -252,7 +255,8 @@ impl RequestHandler for AnalyzeContext { let idx_req = self.req.take_idx_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, Some(idx_req), storage, ranges)?; + let mut builder = + SampleBuilder::<_, F>::new(col_req, Some(idx_req), storage, ranges)?; let res = AnalyzeContext::handle_mixed(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -263,7 +267,7 @@ impl RequestHandler for AnalyzeContext { let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = RowSampleBuilder::new( + let mut builder = RowSampleBuilder::<_, F>::new( col_req, storage, ranges, @@ -302,8 +306,8 @@ impl RequestHandler for AnalyzeContext { } } -struct RowSampleBuilder { - data: BatchTableScanExecutor>>, +struct RowSampleBuilder { + data: BatchTableScanExecutor>, F>, max_sample_size: usize, max_fm_sketch_size: 
usize, @@ -314,7 +318,7 @@ struct RowSampleBuilder { is_auto_analyze: bool, } -impl RowSampleBuilder { +impl RowSampleBuilder { fn new( mut req: AnalyzeColumnsReq, storage: TikvStorage>, @@ -784,8 +788,8 @@ impl Drop for BaseRowSampleCollector { } } -struct SampleBuilder { - data: BatchTableScanExecutor>>, +struct SampleBuilder { + data: BatchTableScanExecutor>, F>, max_bucket_size: usize, max_sample_size: usize, @@ -802,7 +806,7 @@ struct SampleBuilder { /// `SampleBuilder` is used to analyze columns. It collects sample from /// the result set using Reservoir Sampling algorithm, estimates NDVs /// using FM Sketch during the collecting process, and builds count-min sketch. -impl SampleBuilder { +impl SampleBuilder { fn new( mut req: AnalyzeColumnsReq, common_handle_req: Option, diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index d6e146adf11..9c0b79ff8b8 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -6,7 +6,7 @@ use ::tracker::{get_tls_tracker_token, with_tls_tracker}; use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tikv_util::time::{self, Duration, Instant}; use txn_types::Key; @@ -148,9 +148,7 @@ impl Tracker { } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.start_observe(); - } + perf_context.start_observe(); }); self.current_stage = TrackerState::ItemBegan(now); } @@ -164,9 +162,7 @@ impl Tracker { self.total_storage_stats.add(&storage_stats); } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); }); self.current_stage = TrackerState::ItemFinished(now); } else { @@ -361,7 +357,7 @@ impl Tracker { fn with_perf_context(&self, f: F) -> T where - F: FnOnce(&mut Option>) -> T, + F: FnOnce(&mut Box) -> T, { thread_local! { static SELECT: RefCell>> = RefCell::new(None); @@ -385,19 +381,13 @@ impl Tracker { }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = unsafe { - with_tls_engine::(|engine| { - engine.kv_engine().map(|engine| { - Box::new(engine.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), - )) as Box - }) - }) - }; - } - f(&mut c) + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) as Box + }); + f(perf_context) }) } } diff --git a/src/import/mod.rs b/src/import/mod.rs index d3a522ede5e..e2fa3729e52 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -29,7 +29,7 @@ pub fn make_rpc_error(err: E) -> RpcStatus { #[macro_export] macro_rules! send_rpc_response { - ($res:ident, $sink:ident, $label:ident, $timer:ident) => {{ + ($res:expr, $sink:ident, $label:ident, $timer:ident) => {{ let res = match $res { Ok(resp) => { IMPORT_RPC_DURATION diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 8ce6f9961fb..08eabe32f0c 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1,7 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
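The `send_rpc_response!` change above, from `$res:ident` to `$res:expr`, is what lets later call sites pass `Ok(resp)` directly instead of first binding the result to a local. A toy illustration of the difference between the two fragment specifiers (these macros are illustrative, not the real one):

    macro_rules! takes_ident {
        ($res:ident) => {
            println!("{:?}", $res)
        };
    }
    macro_rules! takes_expr {
        ($res:expr) => {
            println!("{:?}", $res)
        };
    }

    fn main() {
        let res: Result<u32, ()> = Ok(1);
        takes_ident!(res); // fine: `res` is an identifier
        // takes_ident!(Ok::<u32, ()>(1)); // would not compile: not an identifier
        takes_expr!(Ok::<u32, ()>(1)); // fine: any expression is accepted
    }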
use std::{ - collections::HashMap, + collections::{HashMap, VecDeque}, future::Future, path::PathBuf, sync::{Arc, Mutex}, @@ -11,7 +11,7 @@ use std::{ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures::{sink::SinkExt, stream::TryStreamExt, TryFutureExt}; use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, @@ -19,9 +19,12 @@ use kvproto::{ encryptionpb::EncryptionMethod, errorpb, - import_sstpb::{RawWriteRequest_oneof_chunk as RawChunk, WriteRequest_oneof_chunk as Chunk, *}, + import_sstpb::{ + Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, + SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + }, kvrpcpb::Context, - raft_cmdpb::*, + raft_cmdpb::{CmdType, DeleteRequest, PutRequest, RaftCmdRequest, RaftRequestHeader, Request}, }; use protobuf::Message; use raftstore::{ @@ -44,6 +47,8 @@ use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; use crate::{import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE}; +const MAX_INFLIGHT_RAFT_MSGS: usize = 64; + /// ImportSstService provides tikv-server with the ability to ingest SST files. /// /// It saves the SST sent from client to a file and then sends a command to @@ -74,6 +79,161 @@ pub struct SnapshotResult { term: u64, } +struct RequestCollector { + context: Context, + max_raft_req_size: usize, + /// Retain the last ts of each key in each request. + /// This is used for write CF because the resolved ts observer hates duplicated + /// keys in the same request. + write_reqs: HashMap<Vec<u8>, (Request, u64)>, + /// A simpler collector that just keeps all items; it does not contain + /// duplicated key-values. This is used for default CF. + default_reqs: HashMap<Vec<u8>, Request>, + /// Size of all `Request`s. + unpacked_size: usize, + + pending_raft_reqs: Vec<RaftCmdRequest>, +} + +impl RequestCollector { + fn new(context: Context, max_raft_req_size: usize) -> Self { + Self { + context, + max_raft_req_size, + write_reqs: HashMap::default(), + default_reqs: HashMap::default(), + unpacked_size: 0, + pending_raft_reqs: Vec::new(), + } + } + + fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec<u8>, v: Vec<u8>) { + // Need to skip the empty key/value that could break the transaction or cause + // data corruption. See details at https://github.com/pingcap/tiflow/issues/5468. + if k.is_empty() || (!is_delete && v.is_empty()) { + return; + } + let mut req = Request::default(); + if is_delete { + let mut del = DeleteRequest::default(); + del.set_key(k); + del.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Delete); + req.set_delete(del); + } else { + if cf == CF_WRITE && !write_needs_restore(&v) { + return; + } + + let mut put = PutRequest::default(); + put.set_key(k); + put.set_value(v); + put.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + } + self.accept(cf, req); + } + + // We need to remove duplicate keys here, since + // https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 + // will panic if it finds a duplicated entry.
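For the write CF the collector keys requests by the encoded key without its timestamp and keeps only the entry with the newest ts, so a single raft batch never carries two writes to the same key. A stripped-down sketch of that retain-latest rule, using plain (key, ts) pairs instead of whole `Request`s:

    use std::collections::HashMap;

    // Keep only the newest ts per key; older duplicates are dropped.
    fn retain_latest(entries: Vec<(Vec<u8>, u64)>) -> HashMap<Vec<u8>, u64> {
        let mut last: HashMap<Vec<u8>, u64> = HashMap::new();
        for (key, ts) in entries {
            let keep = last.get(&key).map(|old| *old < ts).unwrap_or(true);
            if keep {
                last.insert(key, ts);
            }
        }
        last
    }

    fn main() {
        let deduped = retain_latest(vec![
            (b"k1".to_vec(), 40),
            (b"k1".to_vec(), 38), // older duplicate, dropped
            (b"k2".to_vec(), 10),
        ]);
        assert_eq!(deduped[&b"k1".to_vec()], 40);
        assert_eq!(deduped.len(), 2);
    }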
+ fn accept(&mut self, cf: &str, req: Request) { + let k = key_from_request(&req); + match cf { + CF_WRITE => { + let (encoded_key, ts) = match Key::split_on_ts_for(k) { + Ok(k) => k, + Err(err) => { + warn!( + "key without ts, skipping"; + "key" => %log_wrappers::Value::key(k), + "err" => %err + ); + return; + } + }; + if self + .write_reqs + .get(encoded_key) + .map(|(_, old_ts)| *old_ts < ts.into_inner()) + .unwrap_or(true) + { + self.unpacked_size += req.compute_size() as usize; + if let Some((v, _)) = self + .write_reqs + .insert(encoded_key.to_owned(), (req, ts.into_inner())) + { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + } + CF_DEFAULT => { + self.unpacked_size += req.compute_size() as usize; + if let Some(v) = self.default_reqs.insert(k.to_owned(), req) { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + _ => unreachable!(), + } + + if self.unpacked_size >= self.max_raft_req_size { + self.pack_all(); + } + } + + #[cfg(test)] + fn drain_unpacked_reqs(&mut self, cf: &str) -> Vec<Request> { + let res: Vec<Request> = if cf == CF_DEFAULT { + self.default_reqs.drain().map(|(_, req)| req).collect() + } else { + self.write_reqs.drain().map(|(_, (req, _))| req).collect() + }; + for r in &res { + self.unpacked_size -= r.get_cached_size() as usize; + } + res + } + + #[inline] + fn drain_raft_reqs(&mut self, take_unpacked: bool) -> std::vec::Drain<'_, RaftCmdRequest> { + if take_unpacked { + self.pack_all(); + } + self.pending_raft_reqs.drain(..) + } + + fn pack_all(&mut self) { + if self.unpacked_size == 0 { + return; + } + let mut cmd = RaftCmdRequest::default(); + let mut header = make_request_header(self.context.clone()); + // Set the UUID of the header to prevent raftstore from batching our requests. + // The current `resolved_ts` observer assumes that each batch of requests doesn't + // have two writes to the same key (even with 2 different TSes). That was true + // for normal cases because the latches reject concurrent writes to the same key.
+ // However we have bypassed the latch layer :( + header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); + cmd.set_header(header); + let mut reqs: Vec<_> = self.write_reqs.drain().map(|(_, (req, _))| req).collect(); + reqs.append(&mut self.default_reqs.drain().map(|(_, req)| req).collect()); + if reqs.is_empty() { + debug_assert!(false, "attempt to pack an empty request"); + return; + } + cmd.set_requests(reqs.into()); + + self.pending_raft_reqs.push(cmd); + self.unpacked_size = 0; + } + + #[inline] + fn is_empty(&self) -> bool { + self.pending_raft_reqs.is_empty() && self.unpacked_size == 0 + } +} + impl ImportSstService where E: KvEngine, @@ -281,6 +441,101 @@ where Ok(resp) } } + + async fn apply_imp( + mut req: ApplyRequest, + importer: Arc, + router: Router, + limiter: Limiter, + max_raft_size: usize, + ) -> std::result::Result, ImportPbError> { + type RaftWriteFuture = futures::channel::oneshot::Receiver; + async fn handle_raft_write(fut: RaftWriteFuture) -> std::result::Result<(), ImportPbError> { + match fut.await { + Err(e) => { + let msg = format!("failed to complete raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + Ok(mut r) if r.response.get_header().has_error() => { + let mut e = ImportPbError::default(); + e.set_message("failed to complete raft command".to_string()); + e.set_store_error(r.response.take_header().take_error()); + return Err(e); + } + _ => {} + } + Ok(()) + } + + let mut range: Option = None; + + let mut collector = RequestCollector::new(req.take_context(), max_raft_size * 7 / 8); + let mut metas = req.take_metas(); + let mut rules = req.take_rewrite_rules(); + // For compatibility with old requests. + if req.has_meta() { + metas.push(req.take_meta()); + rules.push(req.take_rewrite_rule()); + } + let ext_storage = importer.wrap_kms( + importer + .external_storage_or_cache(req.get_storage_backend(), req.get_storage_cache_id())?, + false, + ); + + let mut inflight_futures: VecDeque = VecDeque::new(); + + let mut tasks = metas.iter().zip(rules.iter()).peekable(); + while let Some((meta, rule)) = tasks.next() { + let buff = importer.read_from_kv_file( + meta, + rule, + ext_storage.clone(), + req.get_storage_backend(), + &limiter, + )?; + if let Some(mut r) = importer.do_apply_kv_file( + meta.get_start_key(), + meta.get_end_key(), + meta.get_start_ts(), + meta.get_restore_ts(), + buff, + |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), + )? 
{ + if let Some(range) = range.as_mut() { + range.start = range.take_start().min(r.take_start()); + range.end = range.take_end().max(r.take_end()); + } else { + range = Some(r); + } + } + + let is_last_task = tasks.peek().is_none(); + for req in collector.drain_raft_reqs(is_last_task) { + while inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { + handle_raft_write(inflight_futures.pop_front().unwrap()).await?; + } + let (cb, future) = paired_future_callback(); + match router.send_command(req, Callback::write(cb), RaftCmdExtraOpts::default()) { + Ok(_) => inflight_futures.push_back(future), + Err(e) => { + let msg = format!("failed to send raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + } + } + } + assert!(collector.is_empty()); + for fut in inflight_futures { + handle_raft_write(fut).await?; + } + + Ok(range) + } } #[macro_export] @@ -375,8 +630,7 @@ where } let task = async move { - let res = Ok(SwitchModeResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!(Ok(SwitchModeResponse::default()), sink, label, timer); }; ctx.spawn(task); } @@ -448,7 +702,7 @@ where .observe(start.saturating_elapsed().as_secs_f64()); if let Err(e) = importer.remove_dir(req.get_prefix()) { - let mut import_err = kvproto::import_sstpb::Error::default(); + let mut import_err = ImportPbError::default(); import_err.set_message(format!("failed to remove directory: {}", e)); resp.set_error(import_err); } @@ -456,176 +710,37 @@ where .with_label_values(&[label]) .observe(start.saturating_elapsed().as_secs_f64()); - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); } // Downloads KV file and performs key-rewrite then apply kv into this tikv // store. - fn apply( - &mut self, - _ctx: RpcContext<'_>, - mut req: ApplyRequest, - sink: UnarySink, - ) { + fn apply(&mut self, _ctx: RpcContext<'_>, req: ApplyRequest, sink: UnarySink) { let label = "apply"; - let timer = Instant::now_coarse(); - let importer = Arc::clone(&self.importer); + let start = Instant::now(); + let importer = self.importer.clone(); let router = self.router.clone(); let limiter = self.limiter.clone(); - let start = Instant::now(); - let raft_size = self.raft_entry_max_size; + let max_raft_size = self.raft_entry_max_size.0 as usize; let handle_task = async move { // Records how long the apply task waits to be scheduled. sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); - let mut start_apply = Instant::now(); - let mut futs = vec![]; - let mut apply_resp = ApplyResponse::default(); - let context = req.take_context(); - let mut rules = req.take_rewrite_rules(); - let mut metas = req.take_metas(); - // For compatibility with old requests. 
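`apply_imp` above bounds memory by keeping at most `MAX_INFLIGHT_RAFT_MSGS` raft commands in flight: the oneshot receivers are parked in a `VecDeque`, the oldest one is awaited before another command is sent, and whatever is still pending is drained at the end. A self-contained sketch of that back-pressure loop, with `send` standing in for `router.send_command` plus `paired_future_callback`:

    use std::collections::VecDeque;

    use futures::channel::oneshot;

    // Keep at most `max_inflight` pending results; wait on the oldest one
    // before submitting more, then drain the rest once all work is submitted.
    async fn submit_all(
        cmds: Vec<Vec<u8>>,
        max_inflight: usize,
        mut send: impl FnMut(Vec<u8>) -> oneshot::Receiver<()>,
    ) -> Result<(), oneshot::Canceled> {
        let mut inflight: VecDeque<oneshot::Receiver<()>> = VecDeque::new();
        for cmd in cmds {
            while inflight.len() >= max_inflight {
                inflight.pop_front().unwrap().await?;
            }
            inflight.push_back(send(cmd));
        }
        for rx in inflight {
            rx.await?;
        }
        Ok(())
    }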
- if req.has_meta() { - metas.push(req.take_meta()); - rules.push(req.take_rewrite_rule()); - } - let result = (|| -> Result<()> { - let mut cmd_reqs = vec![]; - let mut reqs_default = RequestCollector::from_cf(CF_DEFAULT); - let mut reqs_write = RequestCollector::from_cf(CF_WRITE); - let mut req_default_size = 0_u64; - let mut req_write_size = 0_u64; - let mut range: Option = None; - let ext_storage = { - let inner = importer.wrap_kms( - importer.external_storage_or_cache( - req.get_storage_backend(), - req.get_storage_cache_id(), - )?, - false, - ); - inner - }; - - for (i, meta) in metas.iter().enumerate() { - let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { - (&mut reqs_default, &mut req_default_size) - } else { - (&mut reqs_write, &mut req_write_size) - }; - - let mut build_req_fn = build_apply_request( - req_size, - raft_size.0, - reqs, - cmd_reqs.as_mut(), - meta.get_is_delete(), - meta.get_cf(), - context.clone(), - ); - - let buff = importer.read_from_kv_file( - meta, - &rules[i], - Arc::clone(&ext_storage), - req.get_storage_backend(), - &limiter, - )?; - let r: Option = importer.do_apply_kv_file( - meta.get_start_key(), - meta.get_end_key(), - meta.get_start_ts(), - meta.get_restore_ts(), - buff, - &mut build_req_fn, - )?; - - if let Some(mut r) = r { - range = match range { - Some(mut v) => { - let s = v.take_start().min(r.take_start()); - let e = v.take_end().max(r.take_end()); - Some(Range { - start: s, - end: e, - ..Default::default() - }) - } - None => Some(r), - }; - } - } + let mut resp = ApplyResponse::default(); - if !reqs_default.is_empty() { - let cmd = make_request(&mut reqs_default, context.clone()); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_default_size as _); - } - if !reqs_write.is_empty() { - let cmd = make_request(&mut reqs_write, context); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_write_size as _); - } - - start_apply = Instant::now(); - for cmd in cmd_reqs { - let (cb, future) = paired_future_callback(); - match router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) - { - Ok(_) => futs.push(future), - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to send raft command: {}", e)); - apply_resp.set_error(import_err); - } - } - } - if let Some(r) = range { - apply_resp.set_range(r); - } - Ok(()) - })(); - if let Err(e) = result { - apply_resp.set_error(e.into()); + match Self::apply_imp(req, importer, router, limiter, max_raft_size).await { + Ok(Some(r)) => resp.set_range(r), + Err(e) => resp.set_error(e), + _ => {} } - let resp = Ok(join_all(futs).await.iter().fold(apply_resp, |mut resp, x| { - match x { - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to complete raft command: {}", e)); - resp.set_error(import_err); - } - Ok(r) => { - if r.response.get_header().has_error() { - let mut import_err = kvproto::import_sstpb::Error::default(); - let err = r.response.get_header().get_error(); - import_err.set_message("failed to complete raft command".to_string()); - // FIXME: if there are many errors, we may lose some of them here. - import_err.set_store_error(err.clone()); - warn!("failed to apply the file to the store"; "error" => ?err); - resp.set_error(import_err); - } - } - } - resp - })); - - // Records how long the apply task waits to be scheduled. 
- sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["apply"]) - .observe(start_apply.saturating_elapsed().as_secs_f64()); - sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["finish"]) - .observe(start.saturating_elapsed().as_secs_f64()); debug!("finished apply kv file with {:?}", resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, start); }; self.block_threads.spawn_ok(handle_task); } @@ -668,7 +783,9 @@ where cipher, limiter, engine, - DownloadExt::default().cache_key(req.get_storage_cache_id()), + DownloadExt::default() + .cache_key(req.get_storage_cache_id()) + .req_type(req.get_request_type()), ); let mut resp = DownloadResponse::default(); match res.await { @@ -678,8 +795,7 @@ where }, Err(e) => resp.set_error(e.into()), } - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); @@ -848,8 +964,12 @@ where }); let ctx_task = async move { - let res = Ok(SetDownloadSpeedLimitResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!( + Ok(SetDownloadSpeedLimitResponse::default()), + sink, + label, + timer + ); }; ctx.spawn(ctx_task); @@ -958,70 +1078,6 @@ fn pb_error_inc(type_: &str, e: &errorpb::Error) { IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); } -enum RequestCollector { - /// Retain the last ts of each key in each request. - /// This is used for write CF because resolved ts observer hates duplicated - /// key in the same request. - RetainLastTs(HashMap, (Request, u64)>), - /// Collector favor that simple collect all items, and it do not contains - /// duplicated key-value. This is used for default CF. 
- KeepAll(HashMap, Request>), -} - -impl RequestCollector { - fn from_cf(cf: &str) -> Self { - match cf { - CF_DEFAULT | "" => Self::KeepAll(Default::default()), - CF_WRITE => Self::RetainLastTs(Default::default()), - _ => { - warn!("unknown cf name, using default request collector"; "cf" => %cf); - Self::RetainLastTs(Default::default()) - } - } - } - - fn accept(&mut self, req: Request) { - let k = key_from_request(&req); - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - let (encoded_key, ts) = match Key::split_on_ts_for(k) { - Ok(k) => k, - Err(err) => { - warn!("key without ts, skipping"; "key" => %log_wrappers::Value::key(k), "err" => %err); - return; - } - }; - if reqs - .get(encoded_key) - .map(|(_, old_ts)| *old_ts < ts.into_inner()) - .unwrap_or(true) - { - reqs.insert(encoded_key.to_owned(), (req, ts.into_inner())); - } - } - RequestCollector::KeepAll(ref mut reqs) => { - reqs.insert(k.to_owned(), req); - } - } - } - - fn drain(&mut self) -> Vec { - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - reqs.drain().map(|(_, (req, _))| req).collect() - } - RequestCollector::KeepAll(ref mut reqs) => reqs.drain().map(|(_, req)| req).collect(), - } - } - - fn is_empty(&self) -> bool { - match self { - RequestCollector::RetainLastTs(reqs) => reqs.is_empty(), - RequestCollector::KeepAll(reqs) => reqs.is_empty(), - } - } -} - fn key_from_request(req: &Request) -> &[u8] { if req.has_put() { return req.get_put().get_key(); @@ -1029,8 +1085,7 @@ fn key_from_request(req: &Request) -> &[u8] { if req.has_delete() { return req.get_delete().get_key(); } - warn!("trying to extract key from request is neither put nor delete."); - b"" + panic!("trying to extract key from request is neither put nor delete.") } fn make_request_header(mut context: Context) -> RaftRequestHeader { @@ -1042,77 +1097,6 @@ fn make_request_header(mut context: Context) -> RaftRequestHeader { header } -fn make_request(reqs: &mut RequestCollector, context: Context) -> RaftCmdRequest { - let mut cmd = RaftCmdRequest::default(); - let mut header = make_request_header(context); - // Set the UUID of header to prevent raftstore batching our requests. - // The current `resolved_ts` observer assumes that each batch of request doesn't - // has two writes to the same key. (Even with 2 different TS). That was true - // for normal cases because the latches reject concurrency write to keys. - // However we have bypassed the latch layer :( - header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); - cmd.set_header(header); - cmd.set_requests(reqs.drain().into()); - cmd -} - -// we need to remove duplicate keys in here, since -// in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 -// will panic if found duplicated entry during Vec. -fn build_apply_request<'a, 'b>( - req_size: &'a mut u64, - raft_size: u64, - reqs: &'a mut RequestCollector, - cmd_reqs: &'a mut Vec, - is_delete: bool, - cf: &'b str, - context: Context, -) -> Box, Vec) + 'b> -where - 'a: 'b, -{ - // use callback to collect kv data. - Box::new(move |k: Vec, v: Vec| { - // Need to skip the empty key/value that could break the transaction or cause - // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. 
- if k.is_empty() || (!is_delete && v.is_empty()) { - return; - } - - let mut req = Request::default(); - if is_delete { - let mut del = DeleteRequest::default(); - del.set_key(k); - del.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Delete); - req.set_delete(del); - } else { - if cf == CF_WRITE && !write_needs_restore(&v) { - return; - } - - let mut put = PutRequest::default(); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Put); - req.set_put(put); - } - - // When the request size get grow to max request size, - // build the request and add it to a batch. - if *req_size + req.compute_size() as u64 > raft_size * 7 / 8 { - IMPORTER_APPLY_BYTES.observe(*req_size as _); - *req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - - *req_size += req.compute_size() as u64; - reqs.accept(req); - }) -} - fn write_needs_restore(write: &[u8]) -> bool { let w = WriteRef::parse(write); match w { @@ -1146,9 +1130,7 @@ mod test { use kvproto::{kvrpcpb::Context, raft_cmdpb::*}; use txn_types::{Key, TimeStamp, Write, WriteType}; - use crate::import::sst_service::{ - build_apply_request, key_from_request, make_request, RequestCollector, - }; + use crate::import::sst_service::{key_from_request, RequestCollector}; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1213,30 +1195,14 @@ mod test { } fn run_case(c: &Case) { - let mut cmds = vec![]; - let mut reqs = RequestCollector::from_cf(c.cf); - let mut req_size = 0_u64; - - let mut builder = build_apply_request( - &mut req_size, - 1024, - &mut reqs, - &mut cmds, - c.is_delete, - c.cf, - Context::new(), - ); + let mut collector = RequestCollector::new(Context::new(), 1024); for (k, v) in c.mutations.clone() { - builder(k, v); - } - drop(builder); - if !reqs.is_empty() { - let cmd = make_request(&mut reqs, Context::new()); - cmds.push(cmd); + collector.accept_kv(c.cf, c.is_delete, k, v); } + let reqs = collector.drain_raft_reqs(true); - let mut req1: HashMap<_, _> = cmds + let mut req1: HashMap<_, _> = reqs .into_iter() .flat_map(|mut x| x.take_requests().into_iter()) .map(|req| { @@ -1318,8 +1284,7 @@ mod test { #[test] fn test_request_collector_with_write_cf() { - let mut request_collector = RequestCollector::from_cf(CF_WRITE); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ write_req(b"foo", WriteType::Put, 40, 39), write_req(b"aar", WriteType::Put, 38, 37), @@ -1333,23 +1298,21 @@ mod test { ]; for req in reqs { - request_collector.accept(req); + request_collector.accept(CF_WRITE, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_WRITE); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let k2 = key_from_request(r2); k1.cmp(k2) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } #[test] fn test_request_collector_with_default_cf() { - let mut request_collector = RequestCollector::from_cf(CF_DEFAULT); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ default_req(b"foo", b"", 39), default_req(b"zzz", b"", 40), @@ -1363,10 +1326,9 @@ mod test { ]; for req in reqs { - 
request_collector.accept(req); + request_collector.accept(CF_DEFAULT, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_DEFAULT); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let (k1, ts1) = Key::split_on_ts_for(k1).unwrap(); @@ -1376,6 +1338,6 @@ mod test { k1.cmp(k2).then(ts1.cmp(&ts2)) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } } diff --git a/src/read_pool.rs b/src/read_pool.rs index 5212c4ae594..ea20b149a3d 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -11,6 +11,7 @@ use futures::{channel::oneshot, future::TryFutureExt}; use kvproto::kvrpcpb::CommandPri; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{IntCounter, IntGauge}; +use resource_control::{ControlledFuture, ResourceController}; use thiserror::Error; use tikv_util::{ sys::{cpu_time::ProcessStat, SysQuota}, @@ -52,6 +53,7 @@ pub enum ReadPool { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } @@ -73,12 +75,14 @@ impl ReadPool { running_threads, max_tasks, pool_size, + resource_ctl, } => ReadPoolHandle::Yatp { remote: pool.remote().clone(), running_tasks: running_tasks.clone(), running_threads: running_threads.clone(), max_tasks: *max_tasks, pool_size: *pool_size, + resource_ctl: resource_ctl.clone(), }, } } @@ -97,11 +101,18 @@ pub enum ReadPoolHandle { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } impl ReadPoolHandle { - pub fn spawn(&self, f: F, priority: CommandPri, task_id: u64) -> Result<(), ReadPoolError> + pub fn spawn( + &self, + f: F, + priority: CommandPri, + task_id: u64, + group_meta: Vec, + ) -> Result<(), ReadPoolError> where F: Future + Send + 'static, { @@ -123,6 +134,7 @@ impl ReadPoolHandle { remote, running_tasks, max_tasks, + resource_ctl, .. 
} => { let running_tasks = running_tasks.clone(); @@ -140,14 +152,29 @@ impl ReadPoolHandle { CommandPri::Normal => None, CommandPri::Low => Some(2), }; - let extras = Extras::new_multilevel(task_id, fixed_level); - let task_cell = TaskCell::new( - TrackedFuture::new(async move { - f.await; - running_tasks.dec(); - }), - extras, - ); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_meta.clone()); + let task_cell = if let Some(resource_ctl) = resource_ctl { + TaskCell::new( + TrackedFuture::new(ControlledFuture::new( + async move { + f.await; + running_tasks.dec(); + }, + resource_ctl.clone(), + group_meta, + )), + extras, + ) + } else { + TaskCell::new( + TrackedFuture::new(async move { + f.await; + running_tasks.dec(); + }), + extras, + ) + }; remote.spawn(task_cell); } } @@ -159,6 +186,7 @@ impl ReadPoolHandle { f: F, priority: CommandPri, task_id: u64, + group_meta: Vec, ) -> impl Future> where F: Future + Send + 'static, @@ -172,6 +200,7 @@ impl ReadPoolHandle { }, priority, task_id, + group_meta, ); async move { res?; @@ -262,11 +291,11 @@ pub fn build_yatp_read_pool( config: &UnifiedReadPoolConfig, reporter: R, engine: E, + resource_ctl: Option>, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); - let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); let raftkv = Arc::new(Mutex::new(engine)); - let pool = builder + let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) .stack_size(config.stack_size.0 as usize) .thread_count( @@ -284,8 +313,12 @@ pub fn build_yatp_read_pool( }) .before_stop(|| unsafe { destroy_tls_engine::(); - }) - .build_multi_level_pool(); + }); + let pool = if let Some(ref r) = resource_ctl { + builder.build_priority_pool(r.clone()) + } else { + builder.build_multi_level_pool() + }; ReadPool::Yatp { pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS @@ -296,6 +329,7 @@ pub fn build_yatp_read_pool( .max_tasks_per_worker .saturating_mul(config.max_thread_count), pool_size: config.max_thread_count, + resource_ctl, } } @@ -600,7 +634,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -616,18 +650,18 @@ mod tests { let (task3, _tx3) = gen_task(); let (task4, _tx4) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } tx1.send(()).unwrap(); thread::sleep(Duration::from_millis(300)); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); } #[test] @@ -641,7 +675,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ 
-658,11 +692,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -670,10 +704,10 @@ mod tests { handle.scale_pool_size(3); assert_eq!(handle.get_normal_pool_size(), 3); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -690,7 +724,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -707,11 +741,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -723,10 +757,10 @@ mod tests { handle.scale_pool_size(1); assert_eq!(handle.get_normal_pool_size(), 1); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 91b5178f8a0..ff06e41cc57 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -6,6 +6,7 @@ use engine_rocks::{ raw::{Cache, Env}, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, + TabletLogger, }; use engine_traits::{ CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, @@ -134,12 +135,12 @@ impl KvEngineFactory { self.inner.db_resources.statistics.clone() } - fn db_opts(&self) -> RocksDbOptions { + fn db_opts(&self, for_engine: EngineType) -> RocksDbOptions { // Create kv engine. let mut db_opts = self .inner .rocksdb_config - .build_opt(&self.inner.db_resources); + .build_opt(&self.inner.db_resources, for_engine); if !self.inner.lite { db_opts.add_event_listener(RocksEventListener::new( "kv", @@ -170,7 +171,7 @@ impl KvEngineFactory { /// It will always create in path/DEFAULT_DB_SUB_DIR. 
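In the read pool changes above, `spawn` now threads the request's resource-group metadata through and wraps the task future with the resource controller when one is configured; without a controller the future is scheduled unchanged. A stripped-down sketch of that conditional wrapping, where `Controller` is a stand-in and not the real `ControlledFuture`:

    use std::future::Future;

    // Stand-in for a controller that wants to attribute work to a group.
    struct Controller;

    impl Controller {
        fn account(&self, group: &[u8]) {
            println!("charging resource group {:?}", group);
        }
    }

    // Either run the future with accounting or run it as-is, depending on
    // whether resource control is enabled.
    async fn run_task<F>(f: F, ctl: Option<&Controller>, group_meta: Vec<u8>)
    where
        F: Future<Output = ()>,
    {
        match ctl {
            Some(c) => {
                // The real wrapper meters around every poll; accounting once
                // up front keeps this sketch short.
                c.account(&group_meta);
                f.await;
            }
            None => f.await,
        }
    }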
pub fn create_shared_db(&self, path: impl AsRef) -> Result { let path = path.as_ref(); - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv); let cf_opts = self.cf_opts(EngineType::RaftKv); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone()); @@ -187,7 +188,9 @@ impl KvEngineFactory { impl TabletFactory for KvEngineFactory { fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv2); + let tablet_name = path.file_name().unwrap().to_str().unwrap().to_string(); + db_opts.set_info_log(TabletLogger::new(tablet_name)); let cf_opts = self.cf_opts(EngineType::RaftKv2); if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); @@ -215,7 +218,7 @@ impl TabletFactory for KvEngineFactory { fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => ?ctx.suffix); // Create kv engine. - let _db_opts = self.db_opts(); + let _db_opts = self.db_opts(EngineType::RaftKv2); let _cf_opts = self.cf_opts(EngineType::RaftKv2); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 23f8256835b..d35c58cbf34 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -207,6 +207,12 @@ lazy_static! { &["type"] ) .unwrap(); + pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_grpc_resource_group_total", + "Total number of handle grpc message for each resource group", + &["name"] + ) + .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_proxy_msg_total", "Total number of handle grpc proxy message", diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index c50c42c9fc6..751c07c6b65 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -44,14 +44,13 @@ use raftstore::{ errors::Error as RaftServerError, router::{LocalReadRouter, RaftStoreRouter}, store::{ - self, Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, - RegionSnapshot, StoreMsg, WriteResponse, + self, util::encode_start_ts_into_flag_data, Callback as StoreCallback, RaftCmdExtraOpts, + ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; use tikv_kv::{write_modifies, OnAppliedCb, WriteEvent}; use tikv_util::{ - codec::number::NumberEncoder, future::{paired_future_callback, paired_must_called_future_callback}, time::Instant, }; @@ -547,18 +546,21 @@ where let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. 
+ if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); @@ -637,13 +639,16 @@ where } } - fn start_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + fn start_flashback(&self, ctx: &Context, start_ts: u64) -> BoxFuture<'static, kv::Result<()>> { // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the // later flashback. Once invoked, we will update the persistent region meta and // the memory state of the flashback in Peer FSM to reject all read, write // and scheduling operations for this region when propose/apply before we // start the actual data flashback transaction command in the next phase. - let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + let mut req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + req.mut_admin_request() + .mut_prepare_flashback() + .set_start_ts(start_ts); exec_admin(&*self.router, req) } diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 526a1fab3ca..28f2a1d5d25 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -15,7 +15,7 @@ use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; use futures::{Future, Stream, StreamExt}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; pub use node::NodeV2; -use raftstore::store::RegionSnapshot; +use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -23,7 +23,7 @@ use raftstore_v2::{ SimpleWriteBinary, SimpleWriteEncoder, }; use tikv_kv::{Modify, WriteEvent}; -use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use tikv_util::time::Instant; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -153,18 +153,21 @@ impl tikv_kv::Engine for RaftKv2 { let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. 
+ if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index ed6f16e8bec..588e8ae9e9b 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -9,9 +9,13 @@ use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; use raftstore::{ coprocessor::CoprocessorHost, - store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, + store::{ + AutoSplitController, GlobalReplicationState, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, + }, }; -use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreSystem}; +use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; +use resource_metering::CollectorRegHandle; use slog::{info, o, Logger}; use tikv_util::{ config::VersionTrack, @@ -24,11 +28,10 @@ use crate::server::{node::init_store, Result}; pub struct NodeV2 { cluster_id: u64, store: metapb::Store, - system: Option<(RaftRouter, StoreSystem)>, + system: Option<(StoreRouter, StoreSystem)>, has_started: bool, pd_client: Arc, - registry: TabletRegistry, logger: Logger, } @@ -43,7 +46,6 @@ where cfg: &crate::server::Config, pd_client: Arc, store: Option, - registry: TabletRegistry, ) -> NodeV2 { let store = init_store(store, cfg); @@ -53,7 +55,6 @@ where pd_client, system: None, has_started: false, - registry, logger: slog_global::borrow_global().new(o!()), } } @@ -71,16 +72,14 @@ where ) .bootstrap_store()?; self.store.set_id(store_id); + let (router, system) = raftstore_v2::create_store_batch_system(cfg, store_id, self.logger.clone()); - self.system = Some(( - RaftRouter::new(store_id, self.registry.clone(), router), - system, - )); + self.system = Some((router, system)); Ok(()) } - pub fn router(&self) -> &RaftRouter { + pub fn router(&self) -> &StoreRouter { &self.system.as_ref().unwrap().0 } @@ -90,11 +89,15 @@ where pub fn start( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -112,15 +115,10 @@ where ) .bootstrap_first_region(&self.store, store_id)? { - let path = self - .registry - .tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); // TODO: make follow line can recover from abort. - self.registry - .tablet_factory() - .open_tablet(ctx, &path) - .unwrap(); + registry.tablet_factory().open_tablet(ctx, &path).unwrap(); } // Put store only if the cluster is bootstrapped. 
@@ -130,11 +128,15 @@ where self.start_store( raft_engine, + registry, + router, trans, snap_mgr, concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, store_cfg, @@ -187,11 +189,15 @@ where fn start_store( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -207,13 +213,13 @@ where } self.has_started = true; - let (router, system) = self.system.as_mut().unwrap(); + let system = &mut self.system.as_mut().unwrap().1; system.start( store_id, store_cfg, raft_engine, - self.registry.clone(), + registry, trans, self.pd_client.clone(), router.store_router(), @@ -222,6 +228,8 @@ where concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, )?; diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 6e9585ab2c9..8a84eaf6293 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -3,10 +3,7 @@ use std::{collections::HashMap, string::ToString}; use kvproto::diagnosticspb::{ServerInfoItem, ServerInfoPair}; -use tikv_util::{ - config::KIB, - sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}, -}; +use tikv_util::sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}; use walkdir::WalkDir; use crate::server::service::diagnostics::SYS_INFO; @@ -129,12 +126,12 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) fn mem_load_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_memory(); - let total_memory = system.total_memory() * KIB; - let used_memory = system.used_memory() * KIB; - let free_memory = system.free_memory() * KIB; - let total_swap = system.total_swap() * KIB; - let used_swap = system.used_swap() * KIB; - let free_swap = system.free_swap() * KIB; + let total_memory = system.total_memory(); + let used_memory = system.used_memory(); + let free_memory = system.free_memory(); + let total_swap = system.total_swap(); + let used_swap = system.used_swap(); + let free_swap = system.free_swap(); drop(system); let used_memory_pct = (used_memory as f64) / (total_memory as f64); let free_memory_pct = (free_memory as f64) / (total_memory as f64); @@ -683,6 +680,50 @@ mod tests { assert_ne!(processes.get_pairs().len(), 0); } + #[test] + #[cfg(target_os = "linux")] + fn test_memory() { + let mut mem_total_kb: u64 = 0; + { + use std::io::BufRead; + + let f = std::fs::File::open("/proc/meminfo").unwrap(); + let reader = std::io::BufReader::new(f); + for line in reader.lines() { + let l = line.unwrap(); + let mut parts = l.split_whitespace(); + if parts.next().unwrap() != "MemTotal:" { + continue; + } + mem_total_kb = parts.next().unwrap().parse().unwrap(); + let unit = parts.next().unwrap(); + assert_eq!(unit, "kB"); + } + } + assert!(mem_total_kb > 0); + + let mut collector = vec![]; + hardware_info(&mut collector); + + let mut memory_checked = false; + + 'outer: for item in &collector { + if item.get_tp() != "memory" { + continue; + } + for pair in item.get_pairs() { + if pair.get_key() != "capacity" { + continue; + } + assert_eq!(pair.get_value(), (mem_total_kb * 
1024).to_string()); + memory_checked = true; + break 'outer; + } + } + + assert!(memory_checked); + } + #[test] fn test_hardware_info() { let mut collector = vec![]; diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6c85741f64a..da292eca17d 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -171,6 +171,10 @@ macro_rules! handle_request { let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let resp = $future_name(&self.storage, req); let task = async move { let resp = resp.await?; @@ -1043,6 +1047,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, Some(batch_commands_request::request::Cmd::Get(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) }) { @@ -1057,6 +1065,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1071,6 +1083,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = future_copr(copr, Some(peer.to_string()), req) @@ -1098,6 +1114,10 @@ fn handle_batch_commands_request( ); } $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = $future_fn($($arg,)* req) @@ -1430,7 +1450,9 @@ fn future_prepare_flashback_to_version( ) -> impl Future> { let storage = storage.clone(); async move { - let f = storage.get_engine().start_flashback(req.get_context()); + let f = storage + .get_engine() + .start_flashback(req.get_context(), req.get_start_ts()); let mut res = f.await.map_err(storage::Error::from); if matches!(res, Ok(())) { // After the region is put into the flashback state, we need to do a special @@ -1468,10 +1490,7 @@ fn future_flashback_to_version( res = f.await.unwrap_or_else(|e| Err(box_err!(e))); } if matches!(res, Ok(())) { - // Only finish flashback when Flashback executed successfully. - fail_point!("skip_finish_flashback_to_version", |_| { - Ok(FlashbackToVersionResponse::default()) - }); + // Only finish when flashback executed successfully. 
let f = storage.get_engine().end_flashback(req.get_context()); res = f.await.map_err(storage::Error::from); } diff --git a/src/storage/config.rs b/src/storage/config.rs index 68d739c1639..d74bd721104 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -35,6 +35,7 @@ const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; #[serde(rename_all = "kebab-case")] pub enum EngineType { RaftKv, + #[serde(alias = "partitioned-raft-kv")] RaftKv2, } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 080ff2c5951..4837567ee43 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -11,7 +11,7 @@ use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tracker::get_tls_tracker_token; use crate::{ @@ -347,23 +347,15 @@ where }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = with_tls_engine(|engine: &mut E| { - engine.kv_engine().map(|c| { - Box::new(c.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) as Box - }) - }); - }; - if let Some(c) = &mut *c { - c.start_observe(); - } + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) as Box + }); + perf_context.start_observe(); let res = f(); - if let Some(c) = &mut *c { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); res }) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 802b0507849..7429ed8900b 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -89,6 +89,7 @@ use kvproto::{ use pd_client::FeatureGate; use raftstore::store::{util::build_key_range, ReadStats, TxnExt, WriteStats}; use rand::prelude::*; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::{OnAppliedCb, SnapshotExt}; use tikv_util::{ @@ -129,7 +130,7 @@ use crate::{ txn::{ commands::{RawAtomicStore, RawCompareAndSwap, TypedCommand}, flow_controller::{EngineFlowController, FlowController}, - scheduler::Scheduler as TxnScheduler, + scheduler::TxnScheduler, Command, ErrorInner as TxnError, }, types::StorageCallbackType, @@ -270,6 +271,7 @@ impl Storage { quota_limiter: Arc, feature_gate: FeatureGate, causal_ts_provider: Option>, + resource_ctl: Option>, ) -> Result { assert_eq!(config.api_version(), F::TAG, "Api version not match"); @@ -285,6 +287,7 @@ impl Storage { resource_tag_factory.clone(), Arc::clone("a_limiter), feature_gate, + resource_ctl, ); info!("Storage started."); @@ -594,6 +597,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -727,6 +731,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -750,6 +755,11 @@ impl Storage { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = requests[0].get_context().get_priority(); + let group_name = requests[0] + .get_context() + .get_resource_group_name() 
+ .as_bytes() + .to_owned(); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; @@ -910,6 +920,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -929,6 +940,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys .iter() @@ -1082,6 +1094,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1109,6 +1122,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1258,6 +1272,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1276,6 +1291,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::scan_lock; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1405,6 +1421,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1495,15 +1512,20 @@ impl Storage { // Schedule raw modify commands, which reuse the scheduler worker pool. // TODO: separate the txn and raw commands if needed in the future. 
- fn sched_raw_command(&self, tag: CommandKind, future: T) -> Result<()> + fn sched_raw_command( + &self, + group_name: &str, + pri: CommandPri, + tag: CommandKind, + future: T, + ) -> Result<()> where - T: Future + Send + 'static, + T: Future + Send + 'static, { SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); self.sched - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(future) + .get_sched_pool() + .spawn(group_name, pri, future) .map_err(|_| Error::from(ErrorInner::SchedTooBusy)) } @@ -1577,6 +1599,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -1639,6 +1662,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1657,6 +1681,11 @@ impl Storage { const CMD: CommandKind = CommandKind::raw_batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = gets[0].get_context().get_priority(); + let group_name = gets[0] + .get_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let api_version = self.api_version; @@ -1770,6 +1799,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1786,6 +1816,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys.iter().map(|k| (k.clone(), k.clone())).collect(); let resource_tag = self @@ -1866,6 +1897,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1931,7 +1963,10 @@ impl Storage { let provider = self.causal_ts_provider.clone(); let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); - self.sched_raw_command(CMD, async move { + + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2041,7 +2076,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2104,7 +2141,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2163,7 +2202,9 @@ impl Storage { let cf = Self::rawkv_cf(&cf, self.api_version)?; let engine = self.engine.clone(); let 
deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2209,7 +2250,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2272,6 +2315,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag(&ctx); let api_version = self.api_version; @@ -2380,6 +2424,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2400,6 +2445,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2536,6 +2582,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2553,6 +2600,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::raw_get_key_ttl; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -2615,6 +2663,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2642,7 +2691,9 @@ impl Storage { return Err(Error::from(ErrorInner::TtlNotEnabled)); } let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let key = F::encode_raw_key_owned(key, None); let cmd = RawCompareAndSwap::new(cf, key, previous_value, value, ttl, api_version, ctx); Self::sched_raw_atomic_command( @@ -2673,7 +2724,9 @@ impl Storage { Self::check_ttl_valid(pairs.len(), &ttls)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, None); let cmd = RawAtomicStore::new(cf, modifies, ctx); Self::sched_raw_atomic_command( @@ -2696,7 +2749,9 @@ impl Storage { Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; let cf = Self::rawkv_cf(&cf, self.api_version)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { 
// Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = keys .into_iter() @@ -2719,6 +2774,7 @@ impl Storage { ) -> impl Future> { const CMD: CommandKind = CommandKind::raw_checksum; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2793,6 +2849,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -3151,6 +3208,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, + None, ) } @@ -3181,6 +3239,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, + Some(Arc::new(ResourceController::new("test".to_owned(), false))), ) } } diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index fcffd500c8e..86b9ddeab41 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -142,10 +142,22 @@ pub fn acquire_pessimistic_lock( None }; - if need_load_value { - val = reader.get(&key, for_update_ts)?; - } else if need_check_existence { - val = reader.get_write(&key, for_update_ts)?.map(|_| vec![]); + if need_load_value || need_check_existence || should_not_exist { + let write = reader.get_write_with_commit_ts(&key, for_update_ts)?; + if let Some((write, commit_ts)) = write { + // Here `get_write_with_commit_ts` returns only the latest PUT if it exists and + // is not deleted. It's still ok to pass it into `check_data_constraint`. + // In case we are going to lock it with write conflict, we do not check it since + // the statement will then retry. + if locked_with_conflict_ts.is_none() { + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + } + if need_load_value { + val = Some(reader.load_data(&key, write)?); + } else if need_check_existence { + val = Some(vec![]); + } + } } // Pervious write is not loaded. let (prev_write_loaded, prev_write) = (false, None); @@ -1832,4 +1844,134 @@ pub mod tests { must_pessimistic_rollback(&mut engine, b"k1", 10, 50); must_unlocked(&mut engine, b"k1"); } + + #[test] + fn test_repeated_request_check_should_not_exist() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let key = &[b'k', (return_values as u8 * 2) + check_existence as u8] as &[u8]; + + // An empty key. 
+ must_succeed(&mut engine, key, key, 10, 10); + let res = must_succeed_impl( + &mut engine, + key, + key, + 10, + true, + 1000, + 10, + return_values, + check_existence, + 15, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 10, 10, DoPessimisticCheck); + must_commit(&mut engine, key, 10, 19); + + // The key has one record: Lock(10, 19) + must_succeed(&mut engine, key, key, 20, 20); + let res = must_succeed_impl( + &mut engine, + key, + key, + 20, + true, + 1000, + 20, + return_values, + check_existence, + 25, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_put(&mut engine, key, b"v1", key, 20, 20, DoPessimisticCheck); + must_commit(&mut engine, key, 20, 29); + + // The key has records: + // Lock(10, 19), Put(20, 29) + must_succeed(&mut engine, key, key, 30, 30); + let error = must_err_impl( + &mut engine, + key, + key, + 30, + true, + 30, + return_values, + check_existence, + 35, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. }) + )); + must_pessimistic_prewrite_lock(&mut engine, key, key, 30, 30, DoPessimisticCheck); + must_commit(&mut engine, key, 30, 39); + + // Lock(10, 19), Put(20, 29), Lock(30, 39) + must_succeed(&mut engine, key, key, 40, 40); + let error = must_err_impl( + &mut engine, + key, + key, + 40, + true, + 40, + return_values, + check_existence, + 45, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. }) + )); + must_pessimistic_prewrite_delete(&mut engine, key, key, 40, 40, DoPessimisticCheck); + must_commit(&mut engine, key, 40, 49); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49) + must_succeed(&mut engine, key, key, 50, 50); + let res = must_succeed_impl( + &mut engine, + key, + key, + 50, + true, + 1000, + 50, + return_values, + check_existence, + 55, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 50, 50, DoPessimisticCheck); + must_commit(&mut engine, key, 50, 59); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49), Lock(50, 59) + must_succeed(&mut engine, key, key, 60, 60); + let res = must_succeed_impl( + &mut engine, + key, + key, + 60, + true, + 1000, + 60, + return_values, + check_existence, + 65, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 60, 60, DoPessimisticCheck); + must_commit(&mut engine, key, 60, 69); + } + } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 2d79ebc97cc..5b94ea5bd85 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -715,6 +715,13 @@ impl Command { self.command_ext().get_ctx().get_priority() } + pub fn group_name(&self) -> String { + self.command_ext() + .get_ctx() + .get_resource_group_name() + .to_owned() + } + pub fn need_flow_control(&self) -> bool { !self.readonly() && self.priority() != CommandPri::High } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index f6884b0efb8..d3b199208cb 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -32,7 +32,7 @@ pub use self::{ }, commands::{Command, RESOLVE_LOCK_BATCH_SIZE}, latch::{Latches, Lock}, - scheduler::Scheduler, + scheduler::TxnScheduler, store::{ EntryBatch, FixtureStore, FixtureStoreScanner, Scanner, SnapshotStore, Store, TxnEntry, TxnEntryScanner, TxnEntryStore, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index c7c69b5bbf4..0cff9d51d41 100644 --- a/src/storage/txn/sched_pool.rs 
+++ b/src/storage/txn/sched_pool.rs @@ -8,14 +8,16 @@ use std::{ use collections::HashMap; use file_system::{set_io_type, IoType}; -use kvproto::pdpb::QueryKind; +use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; +use resource_control::{ControlledFuture, ResourceController}; use tikv_util::{ sys::SysQuota, - yatp_pool::{FuturePool, PoolTicker, YatpPoolBuilder}, + yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, }; +use yatp::queue::Extras; use crate::storage::{ kv::{destroy_tls_engine, set_tls_engine, Engine, FlowStatsReporter, Statistics}, @@ -41,11 +43,6 @@ thread_local! { static TLS_FEATURE_GATE: RefCell = RefCell::new(latest_feature_gate()); } -#[derive(Clone)] -pub struct SchedPool { - pub pool: FuturePool, -} - #[derive(Clone)] pub struct SchedTicker { reporter: R, @@ -57,38 +54,142 @@ impl PoolTicker for SchedTicker { } } +#[derive(Clone)] +pub enum SchedPool { + // separated thread pools for different priority commands + Vanilla { + high_worker_pool: FuturePool, + worker_pool: FuturePool, + }, + // one priority based thread pool to handle all commands + Priority { + worker_pool: FuturePool, + resource_ctl: Arc, + }, +} + impl SchedPool { pub fn new( engine: E, pool_size: usize, reporter: R, feature_gate: FeatureGate, - name_prefix: &str, + resource_ctl: Option>, ) -> Self { - let engine = Arc::new(Mutex::new(engine)); - // for low cpu quota env, set the max-thread-count as 4 to allow potential cases - // that we need more thread than cpu num. - let max_pool_size = std::cmp::max( - pool_size, - std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), - ); - let pool = YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) - .thread_count(1, pool_size, max_pool_size) - .name_prefix(name_prefix) - // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures - // the tls_engine invariants. - .after_start(move || { - set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IoType::ForegroundWrite); - TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); - }) - .before_stop(move || unsafe { - // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. - destroy_tls_engine::(); - tls_flush(&reporter); - }) - .build_future_pool(); - SchedPool { pool } + let builder = |pool_size: usize, name_prefix: &str| { + let engine = Arc::new(Mutex::new(engine.clone())); + let feature_gate = feature_gate.clone(); + let reporter = reporter.clone(); + // for low cpu quota env, set the max-thread-count as 4 to allow potential cases + // that we need more thread than cpu num. + let max_pool_size = std::cmp::max( + pool_size, + std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), + ); + YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) + .thread_count(1, pool_size, max_pool_size) + .name_prefix(name_prefix) + // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures + // the tls_engine invariants. + .after_start(move || { + set_tls_engine(engine.lock().unwrap().clone()); + set_io_type(IoType::ForegroundWrite); + TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); + }) + .before_stop(move || unsafe { + // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
+ destroy_tls_engine::(); + tls_flush(&reporter); + }) + }; + if let Some(ref r) = resource_ctl { + SchedPool::Priority { + worker_pool: builder(pool_size, "sched-worker-pool") + .build_priority_future_pool(r.clone()), + resource_ctl: r.clone(), + } + } else { + SchedPool::Vanilla { + worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), + high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") + .build_future_pool(), + } + } + } + + pub fn spawn( + &self, + group_name: &str, + priority: CommandPri, + f: impl futures::Future + Send + 'static, + ) -> Result<(), Full> { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.spawn(f) + } else { + worker_pool.spawn(f) + } + } + SchedPool::Priority { + worker_pool, + resource_ctl, + } => { + let fixed_level = match priority { + CommandPri::High => Some(0), + CommandPri::Normal => None, + CommandPri::Low => Some(2), + }; + // TODO: maybe use a better way to generate task_id + let task_id = rand::random::(); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_name.as_bytes().to_owned()); + worker_pool.spawn_with_extras( + ControlledFuture::new( + async move { + f.await; + }, + resource_ctl.clone(), + group_name.as_bytes().to_owned(), + ), + extras, + ) + } + } + } + + pub fn scale_pool_size(&self, pool_size: usize) { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + high_worker_pool.scale_pool_size(std::cmp::max(1, pool_size / 2)); + worker_pool.scale_pool_size(pool_size); + } + SchedPool::Priority { worker_pool, .. } => { + worker_pool.scale_pool_size(pool_size); + } + } + } + + pub fn get_pool_size(&self, priority: CommandPri) -> usize { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.get_pool_size() + } else { + worker_pool.get_pool_size() + } + } + SchedPool::Priority { worker_pool, .. } => worker_pool.get_pool_size(), + } } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index d96e3e7c97f..17110a07e7b 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath -//! Scheduler which schedules the execution of `storage::Command`s. +//! TxnScheduler which schedules the execution of `storage::Command`s. //! //! There is one scheduler for each store. It receives commands from clients, //! executes them against the MVCC layer storage engine. @@ -12,16 +12,16 @@ //! leader. When the client read or write a row, the command is sent to the //! scheduler which is on the region leader's store. //! -//! Scheduler runs in a single-thread event loop, but command executions are +//! TxnScheduler runs in a single-thread event loop, but command executions are //! delegated to a pool of worker thread. //! -//! Scheduler keeps track of all the running commands and uses latches to ensure -//! serialized access to the overlapping rows involved in concurrent commands. -//! But note that scheduler only ensures serialized access to the overlapping -//! rows at command level, but a transaction may consist of multiple commands, -//! therefore conflicts may happen at transaction level. Transaction semantics -//! is ensured by the transaction protocol implemented in the client library, -//! which is transparent to the scheduler. 
+//! TxnScheduler keeps track of all the running commands and uses latches to +//! ensure serialized access to the overlapping rows involved in concurrent +//! commands. But note that scheduler only ensures serialized access to the +//! overlapping rows at command level, but a transaction may consist of multiple +//! commands, therefore conflicts may happen at transaction level. Transaction +//! semantics is ensured by the transaction protocol implemented in the client +//! library, which is transparent to the scheduler. use std::{ marker::PhantomData, @@ -47,12 +47,11 @@ use kvproto::{ use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use smallvec::{smallvec, SmallVec}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; -use tikv_util::{ - deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, -}; +use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; use txn_types::TimeStamp; @@ -239,7 +238,7 @@ impl SchedulerTaskCallback { } } -struct SchedulerInner { +struct TxnSchedulerInner { // slot_id -> { cid -> `TaskContext` } in the slot. task_slots: Vec>>>, @@ -251,11 +250,8 @@ struct SchedulerInner { sched_pending_write_threshold: usize, - // worker pool - worker_pool: SchedPool, - - // high priority commands and system commands will be delivered to this pool - high_priority_pool: SchedPool, + // all tasks are executed in this pool + sched_worker_pool: SchedPool, // used to control write flow running_write_bytes: CachePadded, @@ -292,7 +288,7 @@ fn id_index(cid: u64) -> usize { cid as usize % TASKS_SLOTS_NUM } -impl SchedulerInner { +impl TxnSchedulerInner { /// Generates the next command ID. #[inline] fn gen_id(&self) -> u64 { @@ -375,19 +371,23 @@ impl SchedulerInner { /// /// Returns a deadline error if the deadline is exceeded. Returns the `Task` /// if all latches are acquired, returns `None` otherwise. - fn acquire_lock_on_wakeup(&self, cid: u64) -> Result, StorageError> { + fn acquire_lock_on_wakeup( + &self, + cid: u64, + ) -> Result, (String, CommandPri, StorageError)> { let mut task_slot = self.get_task_slot(cid); let tctx = task_slot.get_mut(&cid).unwrap(); // Check deadline early during acquiring latches to avoid expired requests // blocking other requests. - if let Err(e) = tctx.task.as_ref().unwrap().cmd.deadline().check() { + let cmd = &tctx.task.as_ref().unwrap().cmd; + if let Err(e) = cmd.deadline().check() { // `acquire_lock_on_wakeup` is called when another command releases its locks // and wakes up command `cid`. This command inserted its lock before // and now the lock is at the front of the queue. The actual // acquired count is one more than the `owned_count` recorded in the // lock, so we increase one to make `release` work. 
tctx.lock.owned_count += 1; - return Err(e.into()); + return Err((cmd.group_name(), cmd.priority(), e.into())); } if self.latches.acquire(&mut tctx.lock, cid) { tctx.on_schedule(); @@ -401,25 +401,22 @@ impl SchedulerInner { } fn scale_pool_size(&self, pool_size: usize) { - self.worker_pool.pool.scale_pool_size(pool_size); - self.high_priority_pool - .pool - .scale_pool_size(std::cmp::max(1, pool_size / 2)); + self.sched_worker_pool.scale_pool_size(pool_size); } } -/// Scheduler which schedules the execution of `storage::Command`s. +/// TxnScheduler which schedules the execution of `storage::Command`s. #[derive(Clone)] -pub struct Scheduler { - inner: Arc>, +pub struct TxnScheduler { + inner: Arc>, // The engine can be fetched from the thread local storage of scheduler threads. // So, we don't store the engine here. _engine: PhantomData, } -unsafe impl Send for Scheduler {} +unsafe impl Send for TxnScheduler {} -impl Scheduler { +impl TxnScheduler { /// Creates a scheduler. pub(in crate::storage) fn new( engine: E, @@ -433,6 +430,7 @@ impl Scheduler { resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, feature_gate: FeatureGate, + resource_ctl: Option>, ) -> Self { let t = Instant::now_coarse(); let mut task_slots = Vec::with_capacity(TASKS_SLOTS_NUM); @@ -442,25 +440,18 @@ impl Scheduler { let lock_wait_queues = LockWaitQueues::new(lock_mgr.clone()); - let inner = Arc::new(SchedulerInner { + let inner = Arc::new(TxnSchedulerInner { task_slots, id_alloc: AtomicU64::new(0).into(), latches: Latches::new(config.scheduler_concurrency), running_write_bytes: AtomicUsize::new(0).into(), sched_pending_write_threshold: config.scheduler_pending_write_threshold.0 as usize, - worker_pool: SchedPool::new( - engine.clone(), - config.scheduler_worker_pool_size, - reporter.clone(), - feature_gate.clone(), - "sched-worker-pool", - ), - high_priority_pool: SchedPool::new( + sched_worker_pool: SchedPool::new( engine, - std::cmp::max(1, config.scheduler_worker_pool_size / 2), + config.scheduler_worker_pool_size, reporter, feature_gate.clone(), - "sched-high-pri-pool", + resource_ctl, ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -481,7 +472,7 @@ impl Scheduler { t.saturating_elapsed(), "initialized the transaction scheduler" ); - Scheduler { + TxnScheduler { inner, _engine: PhantomData, } @@ -561,26 +552,19 @@ impl Scheduler { return; } let task = tctx.task.as_ref().unwrap(); - let deadline = task.cmd.deadline(); - let cmd_ctx = task.cmd.ctx().clone(); - self.fail_fast_or_check_deadline(cid, tag, cmd_ctx, deadline); + self.fail_fast_or_check_deadline(cid, &task.cmd); fail_point!("txn_scheduler_acquire_fail"); } - fn fail_fast_or_check_deadline( - &self, - cid: u64, - tag: CommandKind, - cmd_ctx: Context, - deadline: Deadline, - ) { + fn fail_fast_or_check_deadline(&self, cid: u64, cmd: &Command) { + let tag = cmd.tag(); + let ctx = cmd.ctx().clone(); + let deadline = cmd.deadline(); let sched = self.clone(); - self.inner - .high_priority_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&cmd.group_name(), cmd.priority(), async move { match unsafe { - with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&cmd_ctx)) + with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&ctx)) } { // Precheck failed, try to return err early. 
Err(e) => { @@ -632,14 +616,12 @@ impl Scheduler { self.execute(task); } Ok(None) => {} - Err(err) => { + Err((group_name, pri, err)) => { // Spawn the finish task to the pool to avoid stack overflow // when many queuing tasks fail successively. let this = self.clone(); - self.inner - .worker_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&group_name, pri, async move { this.finish_with_err(cid, err); }) .unwrap(); @@ -670,21 +652,17 @@ impl Scheduler { } // pub for test - pub fn get_sched_pool(&self, priority: CommandPri) -> &SchedPool { - if priority == CommandPri::High { - &self.inner.high_priority_pool - } else { - &self.inner.worker_pool - } + pub fn get_sched_pool(&self) -> &SchedPool { + &self.inner.sched_worker_pool } /// Executes the task in the sched pool. fn execute(&self, mut task: Task) { set_tls_tracker_token(task.tracker); let sched = self.clone(); - self.get_sched_pool(task.cmd.priority()) - .pool - .spawn(async move { + + self.get_sched_pool() + .spawn(&task.cmd.group_name(), task.cmd.priority(), async move { fail_point!("scheduler_start_execute"); if sched.check_task_deadline_exceeded(&task) { return; @@ -800,6 +778,7 @@ impl Scheduler { async_apply_prewrite: bool, new_acquired_locks: Vec, tag: CommandKind, + group_name: &str, ) { // TODO: Does async apply prewrite worth a special metric here? if pipelined { @@ -847,7 +826,7 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - self.on_acquired_locks_finished(new_acquired_locks); + self.on_acquired_locks_finished(group_name, new_acquired_locks); if do_wake_up { let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; @@ -932,7 +911,11 @@ impl Scheduler { ); } - fn on_release_locks(&self, released_locks: ReleasedLocks) -> SVec> { + fn on_release_locks( + &self, + group_name: &str, + released_locks: ReleasedLocks, + ) -> SVec> { // This function is always called when holding the latch of the involved keys. 
// So if we found the lock waiting queues are empty, there's no chance // that other threads/commands adds new lock-wait entries to the keys @@ -973,13 +956,21 @@ impl Scheduler { }); if !legacy_wake_up_list.is_empty() || !delay_wake_up_futures.is_empty() { - self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + self.wake_up_legacy_pessimistic_locks( + group_name, + legacy_wake_up_list, + delay_wake_up_futures, + ); } resumable_wake_up_list } - fn on_acquired_locks_finished(&self, new_acquired_locks: Vec) { + fn on_acquired_locks_finished( + &self, + group_name: &str, + new_acquired_locks: Vec, + ) { if new_acquired_locks.is_empty() || self.inner.lock_wait_queues.is_empty() { return; } @@ -992,9 +983,8 @@ impl Scheduler { .update_lock_wait(new_acquired_locks); } else { let lock_wait_queues = self.inner.lock_wait_queues.clone(); - self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { lock_wait_queues.update_lock_wait(new_acquired_locks); }) .unwrap(); @@ -1003,15 +993,16 @@ impl Scheduler { fn wake_up_legacy_pessimistic_locks( &self, + group_name: &str, legacy_wake_up_list: impl IntoIterator, ReleasedLock)> + Send + 'static, delayed_wake_up_futures: impl IntoIterator + Send + 'static, ) { let self1 = self.clone(); - self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + let group_name1 = group_name.to_owned(); + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { for (lock_info, released_lock) in legacy_wake_up_list { let cb = lock_info.key_cb.unwrap().into_inner(); let e = StorageError::from(Error::from(MvccError::from( @@ -1030,9 +1021,8 @@ impl Scheduler { for f in delayed_wake_up_futures { let self2 = self1.clone(); self1 - .get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + .get_sched_pool() + .spawn(&group_name1, CommandPri::High, async move { let res = f.await; if let Some(resumable_lock_wait_entry) = res { self2.schedule_awakened_pessimistic_locks( @@ -1121,7 +1111,7 @@ impl Scheduler { } /// Processes a read command within a worker thread, then posts - /// `ReadFinished` message back to the `Scheduler`. + /// `ReadFinished` message back to the `TxnScheduler`. fn process_read(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_read"); debug!("process read cmd in worker pool"; "cid" => task.cid); @@ -1144,12 +1134,13 @@ impl Scheduler { /// Processes a write command within a worker thread, then posts either a /// `WriteFinished` message if successful or a `FinishedWithErr` message - /// back to the `Scheduler`. + /// back to the `TxnScheduler`. 
async fn process_write(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_write"); let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); let cid = task.cid; + let group_name = task.cmd.group_name(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -1285,7 +1276,7 @@ impl Scheduler { } let woken_up_resumable_entries = if !released_locks.is_empty() { - scheduler.on_release_locks(released_locks) + scheduler.on_release_locks(&group_name, released_locks) } else { smallvec![] }; @@ -1306,6 +1297,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1336,6 +1328,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1522,6 +1515,7 @@ impl Scheduler { is_async_apply_prewrite, new_acquired_locks, tag, + &group_name, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC .get(tag) @@ -1828,7 +1822,7 @@ mod tests { } // TODO(cosven): use this in the following test cases to reduce duplicate code. - fn new_test_scheduler() -> (Scheduler, RocksEngine) { + fn new_test_scheduler() -> (TxnScheduler, RocksEngine) { let engine = TestEngineBuilder::new().build().unwrap(); let config = Config { scheduler_concurrency: 1024, @@ -1838,7 +1832,7 @@ mod tests { ..Default::default() }; ( - Scheduler::new( + TxnScheduler::new( engine.clone(), MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -1854,6 +1848,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ), engine, ) @@ -1978,31 +1973,7 @@ mod tests { #[test] fn test_acquire_latch_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2084,38 +2055,15 @@ mod tests { #[test] fn test_pool_available_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), 
- latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); // Spawn a task that sleeps for 500ms to occupy the pool. The next request // cannot run within 500ms. scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(async { thread::sleep(Duration::from_millis(500)) }) + .get_sched_pool() + .spawn("", CommandPri::Normal, async { + thread::sleep(Duration::from_millis(500)) + }) .unwrap(); let mut req = BatchRollbackRequest::default(); @@ -2144,31 +2092,7 @@ mod tests { #[test] fn test_flow_control_trottle_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut req = CheckTxnStatusRequest::default(); req.mut_context().max_execution_duration_ms = 100; @@ -2212,31 +2136,7 @@ mod tests { #[test] fn test_accumulate_many_expired_commands() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2283,7 +2183,7 @@ mod tests { let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); - let scheduler = Scheduler::new( + let scheduler = TxnScheduler::new( engine, MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -2299,6 +2199,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ); // Use sync mode if pipelined_pessimistic_lock is false. 
assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/tests/Cargo.toml b/tests/Cargo.toml index ae6c6984487..1cc0e6bce87 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -95,6 +95,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" diff --git a/tests/benches/coprocessor_executors/index_scan/util.rs b/tests/benches/coprocessor_executors/index_scan/util.rs index 7531fb68944..8d579c98a4f 100644 --- a/tests/benches/coprocessor_executors/index_scan/util.rs +++ b/tests/benches/coprocessor_executors/index_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan store: &Store, unique: bool, ) -> Self::E { - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/integrated/util.rs b/tests/benches/coprocessor_executors/integrated/util.rs index d9cb5fd2138..4b747307049 100644 --- a/tests/benches/coprocessor_executors/integrated/util.rs +++ b/tests/benches/coprocessor_executors/integrated/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -71,7 +72,7 @@ where store: &Store, ) { crate::util::bencher::BatchNextAllBencher::new(|| { - tidb_query_executors::runner::build_executors( + tidb_query_executors::runner::build_executors::<_, ApiV1>( black_box(executors.to_vec()), black_box(TikvStorage::new(ToTxnStore::::to_store(store), false)), black_box(ranges.to_vec()), diff --git a/tests/benches/coprocessor_executors/table_scan/util.rs b/tests/benches/coprocessor_executors/table_scan/util.rs index 2fe7c4fc4c0..0b2185074c8 100644 --- a/tests/benches/coprocessor_executors/table_scan/util.rs +++ b/tests/benches/coprocessor_executors/table_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchTableScan store: &Store, _: (), ) -> Self::E { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index 5ef442a25cd..0a5708c74ce 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -8,6 +8,7 @@ pub mod store; use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -41,7 +42,7 @@ pub fn build_dag_handler( let mut dag = 
DagRequest::default(); dag.set_executors(executors.to_vec().into()); - tikv::coprocessor::dag::DagHandlerBuilder::new( + tikv::coprocessor::dag::DagHandlerBuilder::<_, ApiV1>::new( black_box(dag), black_box(ranges.to_vec()), black_box(ToTxnStore::::to_store(store)), diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index ca0a473a8b7..92942fa90f9 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -69,7 +69,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_gc_safe_point())), request!(client => block_on(get_store_and_stats(0))), request!(client => get_operator(0)), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -97,67 +97,6 @@ fn test_pd_client_deadlock() { fail::remove(pd_client_reconnect_fp); } -#[test] -fn test_load_global_config() { - let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await - }); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) - } -} - -#[test] -fn test_watch_global_config_on_closed_server() { - let (mut server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - use futures::StreamExt; - let j = std::thread::spawn(move || { - let mut r = client.watch_global_config().unwrap(); - block_on(async move { - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) - } - } - } - } - }); - }); - thread::sleep(Duration::from_millis(200)); - server.stop(); - j.join().unwrap(); -} - // Updating pd leader may be slow, we need to make sure it does not block other // RPC in the same gRPC Environment. #[test] @@ -293,7 +232,9 @@ fn test_retry() { }); test_retry_success(&mut client, |c| block_on(c.get_gc_safe_point())); test_retry_success(&mut client, |c| c.get_operator(0)); - test_retry_success(&mut client, |c| block_on(c.load_global_config(vec![]))); + test_retry_success(&mut client, |c| { + block_on(c.load_global_config(String::default())) + }); fail::remove(pd_client_v2_timeout_fp); fail::remove(pd_client_v2_backoff_fp); diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index eb22ac29e45..d6cf7f1817d 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -1,17 +1,18 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
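Note on the PD global-config change exercised above and in the legacy test that follows: `load_global_config` now takes a config path (e.g. `"global"`) instead of a list of keys and returns the items together with a watch revision, while `store_global_config` writes `GlobalConfigItem`s. A minimal sketch of the item-building step the updated test repeats twice, assuming the `kvproto::pdpb::GlobalConfigItem` setters already used in the hunk; the helper name `make_global_items` is illustrative and not part of the patch:

```rust
use kvproto::pdpb::GlobalConfigItem;

// Build GlobalConfigItem entries from (name, value) pairs, the same way the
// updated test does before calling store_global_config.
fn make_global_items(pairs: &[(&str, &str)]) -> Vec<GlobalConfigItem> {
    pairs
        .iter()
        .map(|(name, value)| {
            let mut item = GlobalConfigItem::default();
            item.set_name(name.to_string());
            // Values are carried as an opaque payload rather than a plain string.
            item.set_payload(value.as_bytes().into());
            item
        })
        .collect()
}
```

With a helper like this, the `store_global_config` call and the later comparison against the `load_global_config` result could share one item list instead of duplicating the mapping.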
use std::{ + str::from_utf8, sync::{mpsc, Arc}, thread, time::Duration, }; use grpcio::EnvBuilder; -use kvproto::metapb::*; +use kvproto::{metapb::*, pdpb::GlobalConfigItem}; use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; -use tikv_util::config::ReadableDuration; +use tikv_util::{config::ReadableDuration, worker::Builder}; fn new_test_server_and_client( update_interval: ReadableDuration, @@ -73,7 +74,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_store_stats_async(0))), request!(client => get_operator(0)), request!(client => block_on(get_tso())), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -108,63 +109,93 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await - }); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let check_items = global_items.clone(); + if let Err(err) = futures::executor::block_on( + client.store_global_config( + String::from("global"), + global_items + .iter() + .map(|(name, value)| { + let mut item = GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_payload(value.as_bytes().into()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); } + + let (res, revision) = + futures::executor::block_on(client.load_global_config(String::from("global"))).unwrap(); + assert!( + res.iter() + .zip(check_items) + .all(|(item1, item2)| item1.name == item2.0 && item1.payload == item2.1.as_bytes()) + ); + assert_eq!(revision, 3); } #[test] fn test_watch_global_config_on_closed_server() { let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let items_clone = global_items.clone(); + let client = Arc::new(client); + let cli_clone = client.clone(); use futures::StreamExt; - let j = std::thread::spawn(move || { - futures::executor::block_on(async move { - let mut r = client.watch_global_config().unwrap(); - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) + let background_worker = Builder::new("background").thread_count(1).create(); + background_worker.spawn_async_task(async move { + match cli_clone.watch_global_config("global".into(), 0) { + Ok(mut stream) => { + let mut i: usize = 0; + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + for item in r.get_changes() { + assert_eq!(item.get_name(), items_clone[i].0); + assert_eq!( + from_utf8(item.get_payload()).unwrap(), + 
items_clone[i].1 + ); + i += 1; + } } + Err(err) => panic!("failed to get stream, err: {:?}", err), } } } - }); + Err(err) => { + if !err.to_string().contains("UNAVAILABLE") { + // Not 14-UNAVAILABLE + panic!("other error occur {:?}", err) + } + } + } }); - thread::sleep(Duration::from_millis(200)); + + if let Err(err) = futures::executor::block_on( + client.store_global_config( + "global".into(), + global_items + .iter() + .map(|(name, value)| { + let mut item = GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_payload(value.as_bytes().into()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); + } + + thread::sleep(Duration::from_millis(100)); server.stop(); - j.join().unwrap(); } // Updating pd leader may be slow, we need to make sure it does not block other diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 2508b544285..1a7d44db972 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -312,10 +312,7 @@ fn test_scale_scheduler_pool() { .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)) .unwrap(); assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), size ); }; diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index cee75ff44b9..ef178ee8aa0 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -2,22 +2,12 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::metapb; +use kvproto::raft_serverpb::RaftApplyState; use pd_client::PdClient; use test_raftstore::*; -use tikv_util::store::find_peer; - -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} +use tikv_util::{config::ReadableDuration, store::find_peer}; // Test the case local reader works well with witness peer. 
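The witness tests below no longer hand-roll the learner/voter dance that the removed `become_witness` helper performed (demote to learner, remove, re-add with `is_witness`, promote back to voter, with a bumped peer id); they drive the mock PD directly and the peer keeps its id. A sketch of that call pattern, assuming the `must_switch_witnesses` method on the test PD client as used in the following hunks; the `switch_witness` wrapper and the `Cluster<ServerCluster>` generic are mine:

```rust
// Flip the witness flag of a single peer in one step via the mock PD client.
fn switch_witness(cluster: &Cluster<ServerCluster>, region_id: u64, peer_id: u64, witness: bool) {
    cluster
        .pd_client
        .must_switch_witnesses(region_id, vec![peer_id], vec![witness]);
}
```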
#[test] @@ -34,8 +24,12 @@ fn test_witness_update_region_in_local_reader() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); cluster.must_put(b"k0", b"v0"); @@ -60,8 +54,8 @@ fn test_witness_update_region_in_local_reader() { .read(None, request.clone(), Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } @@ -69,3 +63,413 @@ fn test_witness_update_region_in_local_reader() { fail::remove("change_peer_after_update_region_store_3"); } + +// Test the case witness pull voter_replicated_index when has pending compact +// cmd. +#[test] +fn test_witness_raftlog_gc_pull_voter_replicated_index() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + 
std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} + +// Test the case witness gc raftlog after reboot. +#[test] +fn test_witness_raftlog_gc_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + + // the witness is down + cluster.stop_node(nodes[2]); + std::thread::sleep(Duration::from_millis(100)); + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} + +// Test the case request snapshot and apply successfully after non-witness +// restart. 
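Both raft-log GC tests above capture the truncated state of every engine before the workload and compare the delta afterwards. That repeated bookkeeping could be wrapped in a helper like the sketch below, assuming the `collections::HashMap`, `RaftApplyState`, `get_raft_msg_or_default`, and `keys::apply_state_key` items this file already uses; the `truncated_states` name is illustrative and `RaftTruncatedState` is my reading of what `take_truncated_state()` returns:

```rust
use collections::HashMap;
use kvproto::raft_serverpb::{RaftApplyState, RaftTruncatedState};

// Snapshot each store's truncated raft-log state for the given region,
// keyed by store id, so a test can later assert how far truncation advanced.
fn truncated_states(
    cluster: &Cluster<ServerCluster>,
    region_id: u64,
) -> HashMap<u64, RaftTruncatedState> {
    let mut states = HashMap::default();
    for (&id, engines) in &cluster.engines {
        let mut state: RaftApplyState =
            get_raft_msg_or_default(engines, &keys::apply_state_key(region_id));
        states.insert(id, state.take_truncated_state());
    }
    states
}
```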
+#[test] +fn test_request_snapshot_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp = "ignore request snapshot"; + fail::cfg(fp, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore request snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 1); + must_get_none(&cluster.get_engine(3), b"k1"); + + cluster.stop_node(nodes[2]); + fail::remove(fp); + std::thread::sleep(Duration::from_millis(100)); + // the PeerState is Unavailable, so it will request snapshot immediately after + // start. + cluster.run_node(nodes[2]).unwrap(); + must_get_none(&cluster.get_engine(3), b"k1"); + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); +} + +// Test the case request snapshot and apply successfully after term change. 
+#[test] +fn test_request_snapshot_after_term_change() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp1 = "ignore generate snapshot"; + fail::cfg(fp1, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore generate snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 1); + must_get_none(&cluster.get_engine(3), b"k1"); + + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // After leader changes, the `term` and `last term` no longer match, so + // continue to receive `MsgAppend` until the two get equal, then retry to + // request snapshot and complete the application. + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp1); +} + +fn test_non_witness_availability(fp: &str) { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.check_peers_availability_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // non-witness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + fail::cfg(fp, "return").unwrap(); + + // witness -> non-witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // snapshot applied + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp); +} + +// Test the case leader pulls non-witness availability when non-witness failed +// to push the info. 
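The snapshot and availability tests above bracket their scenario between `fail::cfg(fp, "return")` and `fail::remove(fp)`. If an assertion in between panics, the failpoint stays armed for later tests; a small RAII guard is one way to keep the enable/disable pair together. This is a sketch only: the `FpGuard` type is not part of this patch, while `fail::cfg` and `fail::remove` are the same crate calls the tests use:

```rust
// RAII wrapper: the failpoint is removed when the guard goes out of scope,
// even if the test body panics in between.
struct FpGuard(&'static str);

impl FpGuard {
    fn new(name: &'static str, action: &str) -> Self {
        fail::cfg(name, action).unwrap();
        FpGuard(name)
    }
}

impl Drop for FpGuard {
    fn drop(&mut self) {
        fail::remove(self.0);
    }
}

// Usage: let _fp = FpGuard::new("ignore request snapshot", "return");
```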
+#[test] +fn test_pull_non_witness_availability() { + test_non_witness_availability("ignore notify leader the peer is available"); +} + +// Test the case non-witness pushes its availability without leader pulling. +#[test] +fn test_push_non_witness_availability() { + test_non_witness_availability("ignore schedule check non-witness availability tick"); +} + +// Test the case non-witness hasn't finish applying snapshot when receives read +// request. +#[test] +fn test_non_witness_replica_read() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // witness -> nonwitness + fail::cfg("ignore request snapshot", "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(100)); + // as we ignore request snapshot, so snapshot should still not applied yet + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3.clone()); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { + region_id: region.get_id(), + ..Default::default() + } + ); + + // start requesting snapshot and give enough time for applying snapshot to + // complete + fail::remove("ignore request snapshot"); + std::thread::sleep(Duration::from_millis(500)); + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) + .unwrap(); + assert_eq!(resp.get_header().has_error(), false); +} diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 70e70b3cbe6..ff1babb7e1f 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -66,7 +66,7 @@ fn start_raftstore( ApplyRouter, RaftBatchSystem, ) { - let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store); + let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store, &None); let engines = create_tmp_engine(dir); let host = CoprocessorHost::default(); let importer = { diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 1a82ec8005e..af03246acf4 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -45,7 +45,8 @@ fn start_server( .name_prefix(thd_name!("test-server")) .build(), ); - let 
(raft_router, _) = create_raft_batch_system::(&cfg.raft_store); + let (raft_router, _) = + create_raft_batch_system::(&cfg.raft_store, &None); let mut snap_worker = Worker::new("snap-handler").lazy_build("snap-handler"); let snap_worker_scheduler = snap_worker.scheduler(); let server_config = Arc::new(VersionTrack::new(cfg.server.clone())); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index c6f8e565218..bb35b069a41 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -180,6 +180,7 @@ fn test_serde_custom_tikv_config() { raft_entry_max_size: ReadableSize::mb(12), raft_log_compact_sync_interval: ReadableDuration::secs(12), raft_log_gc_tick_interval: ReadableDuration::secs(12), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 12, raft_log_gc_count_limit: Some(12), raft_log_gc_size_limit: Some(ReadableSize::kb(1)), @@ -254,6 +255,7 @@ fn test_serde_custom_tikv_config() { max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), check_peers_availability_interval: ReadableDuration::secs(30), + check_request_snapshot_interval: ReadableDuration::minutes(1), }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { @@ -359,7 +361,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: false, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -428,7 +430,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Zstd, @@ -497,7 +499,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -566,7 +568,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -650,7 +652,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index b096437e60c..d79ec7899e2 100644 --- a/tests/integrations/config/test-custom.toml 
+++ b/tests/integrations/config/test-custom.toml @@ -90,7 +90,7 @@ a = "b" [storage] data-dir = "/var" -engine = "raft-kv2" +engine = "partitioned-raft-kv" gc-ratio-threshold = 1.2 max-key-size = 4096 scheduler-concurrency = 123 diff --git a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index 66df6b2832c..405070842b4 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -2,6 +2,7 @@ use std::u64; +use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use kvproto::{ coprocessor::{KeyRange, Request}, @@ -79,7 +80,7 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(store, false), ranges: vec![Range::from_pb_range(range, false)], scan_backward_in_range: true, @@ -89,10 +90,11 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> let mut checksum = 0; let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); let mut digest = digest.clone(); - digest.write(&k); - digest.write(&v); + digest.write(k); + digest.write(v); checksum ^= digest.sum64(); } checksum diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index ad195f62774..056f24b5fee 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -4,11 +4,10 @@ use std::{cmp, thread, time::Duration}; use engine_traits::CF_LOCK; use kvproto::{ - coprocessor::{Request, Response, StoreBatchTask}, - errorpb, - kvrpcpb::{Context, IsolationLevel, LockInfo}, + coprocessor::{Request, Response, StoreBatchTask, StoreBatchTaskResponse}, + kvrpcpb::{Context, IsolationLevel}, }; -use protobuf::{Message, SingularPtrField}; +use protobuf::Message; use raftstore::store::Bucket; use test_coprocessor::*; use test_raftstore::{Cluster, ServerCluster}; @@ -2151,11 +2150,14 @@ fn test_batch_request() { } req }; - let verify_response = |result: &QueryResult, - data: &[u8], - region_err: &SingularPtrField, - locked: &SingularPtrField, - other_err: &String| { + let verify_response = |result: &QueryResult, resp: &Response| { + let (data, details, region_err, locked, other_err) = ( + resp.get_data(), + resp.get_exec_details_v2(), + &resp.region_error, + &resp.locked, + &resp.other_error, + ); match result { QueryResult::Valid(res) => { let expected_len = res.len(); @@ -2179,6 +2181,12 @@ fn test_batch_request() { assert!(region_err.is_none()); assert!(locked.is_none()); assert!(other_err.is_empty()); + let scan_details = details.get_scan_detail_v2(); + assert_eq!(scan_details.processed_versions, row_count as u64); + if row_count > 0 { + assert!(scan_details.processed_versions_size > 0); + assert!(scan_details.total_versions > 0); + } } QueryResult::ErrRegion => { assert!(region_err.is_some()); @@ -2198,6 +2206,20 @@ fn test_batch_request() { } }; + let batch_resp_2_resp = |batch_resp: &mut StoreBatchTaskResponse| -> Response { + let mut response = Response::default(); + response.set_data(batch_resp.take_data()); + if let Some(err) = batch_resp.region_error.take() { + response.set_region_error(err); + } + if let Some(lock_info) = batch_resp.locked.take() { + 
response.set_locked(lock_info); + } + response.set_other_error(batch_resp.take_other_error()); + response.set_exec_details_v2(batch_resp.take_exec_details_v2()); + response + }; + for (ranges, results, invalid_epoch, key_is_locked) in cases.iter() { let mut req = prepare_req(&mut cluster, ranges); if *invalid_epoch { @@ -2229,25 +2251,13 @@ fn test_batch_request() { } } let mut resp = handle_request(&endpoint, req); - let batch_results = resp.take_batch_responses().to_vec(); + let mut batch_results = resp.take_batch_responses().to_vec(); for (i, result) in results.iter().enumerate() { if i == 0 { - verify_response( - result, - resp.get_data(), - &resp.region_error, - &resp.locked, - &resp.other_error, - ); + verify_response(result, &resp); } else { - let batch_resp = batch_results.get(i - 1).unwrap(); - verify_response( - result, - batch_resp.get_data(), - &batch_resp.region_error, - &batch_resp.locked, - &batch_resp.other_error, - ); + let batch_resp = batch_results.get_mut(i - 1).unwrap(); + verify_response(result, &batch_resp_2_resp(batch_resp)); }; } if *key_is_locked { diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 8ede13bd0f4..ee063e0f1e7 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -42,7 +42,7 @@ fn test_node_bootstrap_with_prepared_data() { let pd_client = Arc::new(TestPdClient::new(0, false)); let cfg = new_tikv_config(0); - let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store); + let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store, &None); let simulate_trans = SimulateTransport::new(ChannelTransport::new()); let tmp_path = Builder::new().prefix("test_cluster").tempdir().unwrap(); let engine = diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index a2518cc64ae..f35b21b08a1 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -2,35 +2,18 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::{metapb, raft_cmdpb::ChangePeerRequest, raft_serverpb::PeerState}; +use kvproto::{ + metapb, + raft_cmdpb::ChangePeerRequest, + raft_serverpb::{PeerState, RaftApplyState}, +}; use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use test_raftstore::*; use tikv_util::store::find_peer; -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - -fn become_non_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(false); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - // Test the case that region split or merge with witness peer #[test] fn test_witness_split_merge() { @@ -44,9 
+27,12 @@ fn test_witness_split_merge() { let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); - + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let before = cluster .apply_state(region.get_id(), nodes[2]) .get_applied_index(); @@ -91,8 +77,12 @@ fn test_witness_split_merge() { assert!(find_peer(&right, nodes[2]).unwrap().is_witness); // can't merge with different witness location - let mut peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); - become_non_witness(&cluster, left.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + left.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); let left = cluster.get_region(b"k1"); let req = new_admin_request( left.get_id(), @@ -169,6 +159,8 @@ fn test_witness_conf_change() { .pd_client .must_remove_peer(region.get_id(), peer_on_store3); + std::thread::sleep(Duration::from_millis(10)); + assert_eq!( cluster .region_local_state(region.get_id(), nodes[2]) @@ -177,246 +169,264 @@ fn test_witness_conf_change() { ); } -// #[test] -// // Test flow of switch witness -// fn test_witness_switch_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); - -// std::thread::sleep(Duration::from_millis(100)); -// must_get_none(&cluster.get_engine(3), b"k1"); - -// // witness -> nonwitness -// peer_on_store3.set_role(metapb::PeerRole::Learner); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), peer_on_store3.clone()); -// peer_on_store3.set_is_witness(false); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// std::thread::sleep(Duration::from_millis(100)); -// must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); -// } - -// TODO: add back when switch witness is supported -// // Test the case that leader is forbidden to become witness -// #[test] -// fn test_witness_leader() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let mut peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // can't make leader to witness -// peer_on_store1.set_is_witness(true); -// cluster -// .pd_client 
-// .add_peer(region.get_id(), peer_on_store1.clone()); - -// std::thread::sleep(Duration::from_millis(100)); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// 1 -// ); -// // leader changes to witness failed, so still can get the value -// must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); - -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// // can't transfer leader to witness -// cluster.transfer_leader(region.get_id(), &mut peer_on_store3); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[0], -// ); -// } - -// TODO: add back when election priority is supported -// // Test the case that witness can't be elected as leader based on election -// // priority when there is no log gap -// #[test] -// fn test_witness_election_priority() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); - -// // make sure logs are replicated to the witness -// std::thread::sleep(Duration::from_millis(100)); - -// for i in 1..10 { -// let node = -// cluster.leader_of_region(region.get_id()).unwrap().store_id; cluster. -// stop_node(node); let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// // the witness can't be elected as the leader when there is no log -// gap assert_ne!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[2], -// ); -// cluster.run_node(node).unwrap(); -// } -// } - -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index won't advance when there is a witness -// even // if the gap gap exceeds the gc count limit -// #[test] -// fn test_witness_raftlog_gc_lagged_follower() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k0", b"v0"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); - -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } - -// // one follower is down -// cluster.stop_node(nodes[1]); - -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), 
format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } - -// // the truncated index is not advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert!(state.get_truncated_state(). -// get_index() - before_states[&id].get_index() < 10); } - -// // the follower is back online -// cluster.run_node(nodes[1]).unwrap(); -// cluster.must_put(b"k00", b"v00"); -// must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(300)); - -// // the truncated index is advanced now, as all the peers has replicated -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } - -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index is advance when there is a lagged -// witness #[test] -// fn test_witness_raftlog_gc_lagged_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); - -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } - -// // the witness is down -// cluster.stop_node(nodes[2]); - -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } - -// // the witness is back online -// cluster.run_node(nodes[2]).unwrap(); - -// cluster.must_put(b"k00", b"v00"); -// std::thread::sleep(Duration::from_millis(200)); - -// // the truncated index is advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); println!("{} {}", id, -// state.get_truncated_state().get_index()); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } +// Test flow of switch witness +#[test] +fn test_witness_switch_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = 
block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> non-witness + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +} + +// Test the case that leader is forbidden to become witness +#[test] +fn test_witness_leader() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // can't make leader to witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store1.get_id()], vec![true]); + + std::thread::sleep(Duration::from_millis(100)); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + 1 + ); + // leader changes to witness failed, so still can get the value + must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + // can't transfer leader to witness + cluster.transfer_leader(region.get_id(), peer_on_store3); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[0], + ); +} + +// Test the case that witness can't be elected as leader based on election +// priority when there is no log gap +#[test] +fn test_witness_election_priority() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + cluster.must_put(b"k0", b"v0"); + + // make sure logs are replicated to the witness + std::thread::sleep(Duration::from_millis(100)); + + for i in 1..10 { + let node = cluster.leader_of_region(region.get_id()).unwrap().store_id; + cluster.stop_node(node); + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + // the witness can't be elected as the leader when there is no log gap + assert_ne!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + cluster.run_node(node).unwrap(); + // make sure logs are replicated to the restarted node + std::thread::sleep(Duration::from_millis(100)); + } +} + +// Test the case that truncated index won't advance when there is a witness even +// if the gap gap exceeds the gc count limit 
+#[test] +fn test_witness_raftlog_gc_lagged_follower() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} + +// Test the case that truncated index is advance when there is a lagged witness +#[test] +fn test_witness_raftlog_gc_lagged_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + cluster.must_put(b"k0", b"v0"); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = 
get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // the witness is down + cluster.stop_node(nodes[2]); + + // write some data to make the log gap exceed the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); + + cluster.must_put(b"k00", b"v00"); + std::thread::sleep(Duration::from_millis(200)); + + // the truncated index is advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} // Test the case replica read can't be performed on witness peer. #[test] @@ -435,8 +445,12 @@ fn test_witness_replica_read() { let peer_on_store1 = find_peer(&region, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(&region, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(&region, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let mut request = new_request( region.get_id(), @@ -451,15 +465,15 @@ fn test_witness_replica_read() { .read(None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } ); } -fn must_get_error_recovery_in_progress( +fn must_get_error_is_witness( cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, @@ -474,8 +488,8 @@ fn must_get_error_recovery_in_progress( .call_command_on_leader(req, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() }, @@ -501,9 +515,13 @@ fn test_witness_leader_down() { let peer_on_store1 = find_peer(&region, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); - let mut peer_on_store2 = find_peer(&region, nodes[1]).unwrap().clone(); + let peer_on_store2 = find_peer(&region, nodes[1]).unwrap().clone(); // nonwitness -> witness - become_witness(&cluster, region.get_id(), &mut peer_on_store2); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store2.get_id()], + vec![true], + ); // the other follower is isolated cluster.add_send_filter(IsolationFilterFactory::new(3)); @@ -518,13 +536,13 @@ fn test_witness_leader_down() { // forbid writes let put = new_put_cmd(b"k3", b"v3"); - must_get_error_recovery_in_progress(&mut cluster, &region, put); + must_get_error_is_witness(&mut cluster, &region, put); // forbid reads let get = new_get_cmd(b"k1"); - must_get_error_recovery_in_progress(&mut cluster, &region, get); + must_get_error_is_witness(&mut cluster, &region, get); // forbid read index let read_index = new_read_index_cmd(); - must_get_error_recovery_in_progress(&mut cluster, &region, read_index); +
must_get_error_is_witness(&mut cluster, &region, read_index); let peer_on_store3 = find_peer(&region, nodes[2]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store3); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 496c587a7b9..61a3fb39097 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -711,19 +711,17 @@ fn test_mvcc_flashback() { } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); - // Try to read. + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 1, 2); + // Try to read version 3 (after flashback, FORBIDDEN). let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get let mut get_req = GetRequest::default(); get_req.set_context(ctx.clone()); get_req.key = k.clone(); - get_req.version = 1; + get_req.version = 3; let get_resp = client.kv_get(&get_req).unwrap(); assert!(get_resp.get_region_error().has_flashback_in_progress()); assert!(!get_resp.has_error()); @@ -733,28 +731,48 @@ fn test_mvcc_flashback_block_rw() { scan_req.set_context(ctx.clone()); scan_req.start_key = k.clone(); scan_req.limit = 1; - scan_req.version = 1; + scan_req.version = 3; let scan_resp = client.kv_scan(&scan_req).unwrap(); assert!(scan_resp.get_region_error().has_flashback_in_progress()); + assert!(!scan_resp.has_error()); assert!(scan_resp.pairs.is_empty()); - // Try to write. + // Try to read version 1 (before flashback, ALLOWED). + // Get + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k.clone(); + get_req.version = 1; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert!(get_resp.value.is_empty()); + // Scan + let mut scan_req = ScanRequest::default(); + scan_req.set_context(ctx.clone()); + scan_req.start_key = k.clone(); + scan_req.limit = 1; + scan_req.version = 1; + let scan_resp = client.kv_scan(&scan_req).unwrap(); + assert!(!scan_resp.has_region_error()); + assert!(!scan_resp.has_error()); + assert!(scan_resp.pairs.is_empty()); + // Try to write (FORBIDDEN). // Prewrite let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(k.clone()); mutation.set_value(v); - let prewrite_resp = try_kv_prewrite(&client, ctx, vec![mutation], k, 1); + let prewrite_resp = try_kv_prewrite(&client, ctx.clone(), vec![mutation], k, 1); assert!(prewrite_resp.get_region_error().has_flashback_in_progress()); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback. + must_finish_flashback(&client, ctx, 1, 2, 3); } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx, 0, 1, 2); + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 0, 1); // Try to transfer leader. let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); assert!( @@ -763,7 +781,8 @@ fn test_mvcc_flashback_block_scheduling() { .get_error() .has_flashback_in_progress() ); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback.
+ must_finish_flashback(&client, ctx, 0, 1, 2); } #[test] @@ -794,16 +813,7 @@ fn test_mvcc_flashback_unprepared() { assert!(!get_resp.has_error()); assert_eq!(get_resp.value, b"".to_vec()); // Mock the flashback retry. - let mut req = FlashbackToVersionRequest::default(); - req.set_context(ctx); - req.set_start_ts(6); - req.set_commit_ts(7); - req.version = 0; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); - let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(resp.get_error().is_empty()); + must_finish_flashback(&client, ctx.clone(), 0, 6, 7); let get_resp = client.kv_get(&get_req).unwrap(); assert!(!get_resp.has_region_error()); assert!(!get_resp.has_error()); @@ -811,7 +821,7 @@ fn test_mvcc_flashback_unprepared() { } #[test] -fn test_mvcc_flashback_with_unlimit_range() { +fn test_mvcc_flashback_with_unlimited_range() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; @@ -966,7 +976,7 @@ fn test_debug_raft_log() { entry.set_entry_type(eraftpb::EntryType::EntryNormal); entry.set_data(vec![42].into()); let mut lb = engine.log_batch(0); - lb.append(region_id, vec![entry.clone()]).unwrap(); + lb.append(region_id, None, vec![entry.clone()]).unwrap(); engine.consume(&mut lb, false).unwrap(); assert_eq!( engine.get_entry(region_id, log_index).unwrap().unwrap(), diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 452bcc89238..dc0a85bc9c2 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,9 +159,8 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg - .rocksdb - .build_opt(&cfg.rocksdb.build_resources(Default::default())); + let resource = cfg.rocksdb.build_resources(Default::default()); + let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), None,