From ddf5f34f06b0356d5c27a70300758a3876cd9c2d Mon Sep 17 00:00:00 2001 From: Don Brady Date: Wed, 18 Sep 2024 12:36:48 -0600 Subject: [PATCH] Avoid fault diagnosis if multiple vdevs have errors When multiple drives are throwing errors, it is likely not a drive failing but rather a failure above the drives, like a controller. The active cases context of the drive's peers is now considered when making a diagnosis. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed by: Brian Behlendorf Signed-off-by: Don Brady Closes #16531 --- cmd/zed/agents/zfs_diagnosis.c | 101 ++++++++--- tests/runfiles/linux.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../events/zed_diagnose_multiple.ksh | 168 ++++++++++++++++++ .../events/zed_slow_io_many_vdevs.ksh | 4 +- 5 files changed, 244 insertions(+), 32 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index e35cd0756c60..b45b89af430f 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -71,6 +71,7 @@ typedef struct zfs_case_data { uint64_t zc_ena; uint64_t zc_pool_guid; uint64_t zc_vdev_guid; + uint64_t zc_parent_guid; int zc_pool_state; char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN]; @@ -181,10 +182,10 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) } /* - * count other unique slow-io cases in a pool + * Return count of other unique SERD cases under same vdev parent */ static uint_t -zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) +zfs_other_serd_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) { zfs_case_t *zcp; uint_t cases = 0; @@ -206,10 +207,32 @@ zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { - if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && - zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && - zcp->zc_data.zc_serd_slow_io[0] != '\0' && - fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { + zfs_case_data_t *zcd = &zcp->zc_data; + + /* + * must be same pool and parent vdev but different leaf vdev + */ + if (zcd->zc_pool_guid != zfs_case->zc_pool_guid || + zcd->zc_parent_guid != zfs_case->zc_parent_guid || + zcd->zc_vdev_guid == zfs_case->zc_vdev_guid) { + continue; + } + + /* + * Check if there is another active serd case besides zfs_case + * + * Only one serd engine will be assigned to the case + */ + if (zcd->zc_serd_checksum[0] == zfs_case->zc_serd_checksum[0] && + fmd_serd_active(hdl, zcd->zc_serd_checksum)) { + cases++; + } + if (zcd->zc_serd_io[0] == zfs_case->zc_serd_io[0] && + fmd_serd_active(hdl, zcd->zc_serd_io)) { + cases++; + } + if (zcd->zc_serd_slow_io[0] == zfs_case->zc_serd_slow_io[0] && + fmd_serd_active(hdl, zcd->zc_serd_slow_io)) { cases++; } } @@ -502,6 +525,34 @@ zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) } } +/* + * Record the specified event in the SERD engine and return a + * boolean value indicating whether or not the engine fired as + * the result of inserting this event. + * + * When the pool has similar active cases on other vdevs, then + * the fired state is disregarded and the case is retired. + */ +static int +zfs_fm_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep, + zfs_case_t *zcp, const char *err_type) +{ + int fired = fmd_serd_record(hdl, name, ep); + int peers = 0; + + if (fired && (peers = zfs_other_serd_cases(hdl, &zcp->zc_data)) > 0) { + fmd_hdl_debug(hdl, "pool %llu is tracking %d other %s cases " + "-- skip faulting the vdev %llu", + (u_longlong_t)zcp->zc_data.zc_pool_guid, + peers, err_type, + (u_longlong_t)zcp->zc_data.zc_vdev_guid); + zfs_case_retire(hdl, zcp); + fired = 0; + } + + return (fired); +} + /* * Main fmd entry point. */ @@ -510,7 +561,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { zfs_case_t *zcp, *dcp; int32_t pool_state; - uint64_t ena, pool_guid, vdev_guid; + uint64_t ena, pool_guid, vdev_guid, parent_guid; uint64_t checksum_n, checksum_t; uint64_t io_n, io_t; er_timeval_t pool_load; @@ -600,6 +651,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, &parent_guid) != 0) + parent_guid = 0; if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) ena = 0; @@ -710,6 +764,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) data.zc_ena = ena; data.zc_pool_guid = pool_guid; data.zc_vdev_guid = vdev_guid; + data.zc_parent_guid = parent_guid; data.zc_pool_state = (int)pool_state; fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); @@ -872,8 +927,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) SEC2NSEC(io_t)); zfs_case_serialize(zcp); } - if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + if (zfs_fm_serd_record(hdl, zcp->zc_data.zc_serd_io, + ep, zcp, "io error")) { checkremove = B_TRUE; + } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { uint64_t slow_io_n, slow_io_t; @@ -899,25 +956,10 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } /* Pass event to SERD engine and see if this triggers */ if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && - fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, - ep)) { - /* - * Ignore a slow io diagnosis when other - * VDEVs in the pool show signs of being slow. - */ - if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { - zfs_case_retire(hdl, zcp); - fmd_hdl_debug(hdl, "pool %llu has " - "multiple slow io cases -- skip " - "degrading vdev %llu", - (u_longlong_t) - zcp->zc_data.zc_pool_guid, - (u_longlong_t) - zcp->zc_data.zc_vdev_guid); - } else { - zfs_case_solve(hdl, zcp, - "fault.fs.zfs.vdev.slow_io"); - } + zfs_fm_serd_record(hdl, + zcp->zc_data.zc_serd_slow_io, ep, zcp, "slow io")) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.slow_io"); } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { @@ -968,8 +1010,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) SEC2NSEC(checksum_t)); zfs_case_serialize(zcp); } - if (fmd_serd_record(hdl, - zcp->zc_data.zc_serd_checksum, ep)) { + if (zfs_fm_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep, zcp, + "checksum")) { zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.checksum"); } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4613c895b0cd..d791ce57c3fe 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -109,7 +109,7 @@ tags = ['functional', 'direct'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', - 'zed_slow_io', 'zed_slow_io_many_vdevs'] + 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 053a2c09f649..131624c2e8ef 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1490,6 +1490,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ + functional/events/zed_diagnose_multiple.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh b/tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh new file mode 100755 index 000000000000..aea9f1dd34de --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_diagnose_multiple.ksh @@ -0,0 +1,168 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc. +# + +# DESCRIPTION: +# Verify that simultaneous io error events from multiple vdevs +# doesn't generate a fault +# +# STRATEGY: +# 1. Create a pool with a 4 disk raidz vdev +# 2. Inject io errors +# 3. Verify that ZED detects the errors but doesn't fault any vdevs +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_error_multiple" +VDEV1="$TEST_BASE_DIR/vdevfile1.$$" +VDEV2="$TEST_BASE_DIR/vdevfile2.$$" +VDEV3="$TEST_BASE_DIR/vdevfile3.$$" +VDEV4="$TEST_BASE_DIR/vdevfile4.$$" +VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" +TESTPOOL="zed_test_pool" +FILEPATH="$TESTDIR/zed.testfile" + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEVS +} + +function start_io_errors +{ + for vdev in $VDEVS + do + log_must zpool set io_n=4 $TESTPOOL $vdev + log_must zpool set io_t=60 $TESTPOOL $vdev + done + zpool sync + + for vdev in $VDEVS + do + log_must zinject -d $vdev -e io $TESTPOOL + done + zpool sync +} + +function multiple_slow_vdevs_test +{ + log_must truncate -s 1G $VDEVS + default_raidz_setup_noexit $VDEVS + + log_must zpool events -c + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set recordsize=4K $TESTPOOL + + log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=4 + zpool sync + + # + # Read the file with io errors injected on the disks + # This will cause multiple errors on each disk to trip ZED SERD + # + # pool: zed_test_pool + # state: ONLINE + # status: One or more devices has experienced an unrecoverable error. An + # attempt was made to correct the error. Applications are unaffected. + # action: Determine if the device needs to be replaced, and clear the errors + # using 'zpool clear' or replace the device with 'zpool replace'. + # see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-9P + # config: + # + # NAME STATE READ WRITE CKSUM + # zed_test_pool ONLINE 0 0 0 + # raidz1-0 ONLINE 0 0 0 + # /var/tmp/vdevfile1.1547063 ONLINE 532 561 0 + # /var/tmp/vdevfile2.1547063 ONLINE 547 594 0 + # /var/tmp/vdevfile3.1547063 ONLINE 1.05K 1.10K 0 + # /var/tmp/vdevfile4.1547063 ONLINE 1.05K 1.00K 0 + # + + start_io_errors + dd if=$FILEPATH of=/dev/null bs=1M count=4 2>/dev/null + log_must zinject -c all + + # count io error events available for processing + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.io" | wc -l) + [[ $events -ge "50" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events io error events found" + if [[ $events -lt "50" ]]; then + log_note "bailing: not enough events to complete the test" + destroy_pool $TESTPOOL + return + fi + + # + # give slow ZED a chance to process the checkum events + # + typeset -i i=0 + typeset -i skips=0 + while [[ $i -lt 75 ]]; do + skips=$(grep "retiring case" \ + $ZEDLET_DIR/zed.log | wc -l) + [[ $skips -gt "0" ]] && break + i=$((i+1)) + sleep 1 + done + + log_note $skips fault skips in ZED log after $i seconds + [ $skips -gt "0" ] || log_fail "expecting to see skips" + + fault=$(grep "zpool_vdev_fault" $ZEDLET_DIR/zed.log | wc -l) + log_note $fault vdev fault in ZED log + [ $fault -eq "0" ] || \ + log_fail "expecting no fault events, found $fault" + + destroy_pool $TESTPOOL +} + +log_assert "Test ZED io errors across multiple vdevs" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start +multiple_slow_vdevs_test + +log_pass "Test ZED io errors across multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh index 3357ae2e3510..e27ef91cbdc7 100755 --- a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh @@ -25,10 +25,10 @@ # # DESCRIPTION: -# Verify that delay events from multiple vdevs doesnt degrade +# Verify that delay events from multiple vdevs doesn't degrade # # STRATEGY: -# 1. Create a pool with a 3 disk raidz vdev +# 1. Create a pool with a 4 disk raidz vdev # 2. Inject slow io errors # 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs #