diff --git a/Makefile.am b/Makefile.am
index 2b1023b..e6c8489 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,7 +14,8 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \
scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \
scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \
scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \
- scripts/lbnl_ps.nhc
+ scripts/lbnl_ps.nhc \
+ scripts/osc_gpfs.nhc
MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing
DISTCLEANFILES =
diff --git a/README.md b/README.md
index 0b375e0..27980e3 100644
--- a/README.md
+++ b/README.md
@@ -452,6 +452,7 @@ The table below provides a list of the configuration variables which may be used
| MCELOG_MAX_CORRECTED_RATE | `9` | Maximum number of **corrected** MCEs allowed before `check_hw_mcelog()` returns failure |
| MCELOG_MAX_UNCORRECTED_RATE | `0` | Maximum number of **uncorrected** MCEs allowed before `check_hw_mcelog()` returns failure |
| MDIAG_CMD | `mdiag` | Command to use to invoke Moab's `mdiag` command (may include path) |
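+| MMFSADM | `/usr/lpp/mmfs/bin/mmfsadm` | Command to use to invoke the GPFS `mmfsadm` command (used by `check_gpfs_verbs_status()`) |
+| MMHEALTH | `/usr/lpp/mmfs/bin/mmhealth` | Command to use to invoke the GPFS `mmhealth` command (used by `check_gpfs_health()`) |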
| *NAME | `nhc` | Used to populate default paths/filenames for configuration |
| NHC_AUTH_USERS | `root nobody` | Users authorized to have arbitrary processes running on compute nodes |
| NHC_CHECK_ALL | `0` | Forces all checks to be non-fatal. Displays each failure message, reports total number of failed checks, and returns that number. |
@@ -771,6 +772,40 @@ _**Example**_: `check_fs_used / 98%`
+##### check_gpfs_health
+`check_gpfs_health [-0] [-a] [-l] [-s] [-e action] component`
+
+Checks the health of a GPFS component. The value for _component_ must match a component reported by mmhealth.
+
+| **Check Option** | **Purpose** |
+| ---------------- | ----------- |
+| `-0` | Non-fatal. Failure of this check will be ignored. |
+| `-a` | Find, report, and act on all matching components. Default behavior is to fail the check at the first matching component that is not healthy. |
+| `-l` | Log unhealthy component (or components, if used with `-a`) to NHC log (`$LOGFILE`). |
+| `-s` | Log unhealthy component (or components, if used with `-a`) to the syslog. |
+| `-e`_`action`_ | Execute `/bin/bash -c`_`action`_ if component is NOT healthy. |
+
+_**Example**_: `check_gpfs_health NETWORK`
+
+
+
+
+##### check_gpfs_verbs_status
+`check_gpfs_verbs_status [-0] [-l] [-s]`
+
+Checks that GPFS has started Verbs and is using RDMA.
+
+| **Check Option** | **Purpose** |
+| ---------------- | ----------- |
+| `-0` | Non-fatal. Failure of this check will be ignored. |
+| `-l` | Log a message to the NHC log (`$LOGFILE`) if verbs is not started. |
+| `-s` | Log a message to the syslog if verbs is not started. |
+
+_**Example**_: `check_gpfs_verbs_status`
+
+
+
+
##### check_hw_cpuinfo
`check_hw_cpuinfo [sockets] [cores] [threads]`
diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc
new file mode 100644
index 0000000..24c0818
--- /dev/null
+++ b/scripts/osc_gpfs.nhc
@@ -0,0 +1,173 @@
+# OSC NHC - GPFS checks
+#
+# Trey Dockendorf
+# 1 November 2018
+#
+
+GPFS_COMPONENT=()
+GPFS_ENTITY=()
+GPFS_STATUS=()
+MMHEALTH="${MMHEALTH:-/usr/lpp/mmfs/bin/mmhealth}"
+MMFSADM="${MMFSADM:-/usr/lpp/mmfs/bin/mmfsadm}"
+export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS MMHEALTH MMFSADM
+
+# Populate GPFS_COMPONENT, GPFS_ENTITY and GPFS_STATUS from the colon-delimited
+# output of "$MMHEALTH node show -Y".
+#
+# Each output line is split on ":" and only lines whose field 9 is "NODE" are
+# kept (presumably field 9 is the entity type -- confirm against the mmhealth
+# -Y format).  Field 7 is stored as the component, field 8 as the entity and
+# field 10 as the status.  The three arrays are reset on every call, so they
+# stay empty when the command produces no usable lines.
+function nhc_gpfs_health_gather_data() {
+    local LINE_CNT=0
+    local -a FIELD
+
+    GPFS_COMPONENT=() GPFS_ENTITY=() GPFS_STATUS=()
+
+    # -r keeps any backslash in the command output literal instead of letting
+    # read treat it as an escape character.
+    while IFS=: read -r -a FIELD; do
+        # Header lines carry no health data.
+        if [[ "${FIELD[2]}" == "HEADER" ]]; then
+            continue
+        fi
+        # "Event" lines are not state records.
+        if [[ "${FIELD[1]}" == "Event" ]]; then
+            continue
+        fi
+        if [[ "${FIELD[9]}" != "NODE" ]]; then
+            continue
+        fi
+        GPFS_COMPONENT[$LINE_CNT]="${FIELD[7]}"
+        GPFS_ENTITY[$LINE_CNT]="${FIELD[8]}"
+        GPFS_STATUS[$LINE_CNT]="${FIELD[10]}"
+        dbg "Got GPFS health ${GPFS_COMPONENT[$LINE_CNT]} ${GPFS_ENTITY[$LINE_CNT]} ${GPFS_STATUS[$LINE_CNT]}"
+        ((LINE_CNT++))
+    # $MMHEALTH is left unquoted, so a value containing whitespace is
+    # word-split into a command and its arguments.
+    done < <($MMHEALTH node show -Y)
+
+    export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS
+}
+
+# Checks GPFS health for a given component
+# check_gpfs_health [-0] [-a] [-l] [-s] [-e action] component
+#
+#   -0         Non-fatal: an unhealthy component is logged but 0 is returned.
+#   -a         Examine every matching component instead of stopping at the
+#              first one that is not HEALTHY.
+#   -l         Pass the failure message to log().
+#   -s         Pass the failure message to syslog().
+#   -e action  Run action in the background via "${SHELL:-/bin/bash} -c" for
+#              each unhealthy matching component.
+#   component  Match string handed to mcheck() and tested against every
+#              gathered component name.
+#
+# Returns 0 if no matching component is unhealthy (or if -0 was given).
+# Otherwise the failure is reported through die() and the return value is 1,
+# or, with -a, the number of unhealthy matching components.
+function check_gpfs_health() {
+    local NONFATAL=0 ALL=0 LOG=0 SYSLOG=0 ACTION=""
+    local THIS_COMPONENT THIS_ENTITY THIS_STATUS MSG i
+    # Keep the option variable and the requested component out of the global scope.
+    local OPTION COMPONENT
+
+    # Gather the health data only once; later calls reuse the cached arrays.
+    if [[ ${#GPFS_COMPONENT[*]} -eq 0 ]]; then
+        nhc_gpfs_health_gather_data
+    fi
+
+    OPTIND=1
+    while getopts ":0alse:" OPTION ; do
+        case "$OPTION" in
+            0) NONFATAL=1 ;;
+            a) ALL=1 ;;
+            l) LOG=1 ;;
+            s) SYSLOG=1 ;;
+            e) ACTION="$OPTARG" ;;
+            :) die 1 "$CHECK:  Option -$OPTARG requires an argument." ; return 1 ;;
+            \?) die 1 "$CHECK:  Invalid option:  -$OPTARG" ; return 1 ;;
+        esac
+    done
+    shift $((OPTIND-1))
+    COMPONENT="$1"
+    if [[ -z "$COMPONENT" ]]; then
+        # Return explicitly, like every other error path here, so the check
+        # stops with a failure status even if die() returns to the caller.
+        die 1 "$CHECK:  Syntax error:  Must provide component to check."
+        return 1
+    fi
+    dbg "Looking for GPFS health component \"$COMPONENT\""
+    for ((i=0; i < ${#GPFS_COMPONENT[*]}; i++)); do
+        THIS_COMPONENT="${GPFS_COMPONENT[$i]}"
+        THIS_ENTITY="${GPFS_ENTITY[$i]}"
+        THIS_STATUS="${GPFS_STATUS[$i]}"
+        # Show the requested match string, not the first element of the data array.
+        dbg "CHECKING \"$THIS_COMPONENT\" vs. \"$COMPONENT\""
+        if ! mcheck "$THIS_COMPONENT" "$COMPONENT"; then
+            continue
+        fi
+        dbg "Matching GPFS health found:  $THIS_COMPONENT:  entity=$THIS_ENTITY status=$THIS_STATUS"
+        if [[ "$THIS_STATUS" == "HEALTHY" ]]; then
+            continue
+        fi
+        MSG="$CHECK:  GPFS health for \"$THIS_COMPONENT\" is $THIS_STATUS"
+        # We have a winner.  Or loser, as the case may be.
+        # The message is quoted so it reaches log()/syslog() as one argument,
+        # without word splitting or glob expansion.
+        if [[ "$LOG" == "1" ]]; then
+            log "$MSG"
+        fi
+        if [[ "$SYSLOG" == "1" ]]; then
+            syslog "$MSG"
+        fi
+        if [[ "$ACTION" != "" ]]; then
+            ${SHELL:-/bin/bash} -c "$ACTION" &
+        fi
+        if [[ $ALL -ge 1 ]]; then
+            # With -a, ALL doubles as a counter: 1 + number of failures so far.
+            if [[ -n "$MSG" ]]; then
+                log "$MSG ($ALL)"
+            fi
+            ((ALL++))
+            continue
+        elif [[ $NONFATAL == 1 ]]; then
+            if [[ -n "$MSG" ]]; then
+                log "$MSG (non-fatal)"
+            fi
+            return 0
+        fi
+        die 1 "$MSG"
+        return 1
+    done
+    # -a (all) does not necessarily imply -0 (non-fatal).  A value of 1 for $ALL
+    # means -a was passed in but no errors were found.  2 or above is an error.
+    if [[ $ALL -gt 1 ]]; then
+        # We had at least 1 unhealthy component.  Fail unless we're also non-fatal.
+        if [[ $NONFATAL == 1 ]]; then
+            if [[ -n "$MSG" ]]; then
+                log "$MSG (non-fatal)"
+            fi
+            return 0
+        fi
+        # Convert the counter back into the actual number of failures.
+        ((ALL--))
+        die $ALL "$MSG (last of $ALL)"
+        return $ALL
+    fi
+    return 0
+}
+
+# Checks GPFS verbs status
+# check_gpfs_verbs_status [-0] [-l] [-s]
+#
+#   -0  Non-fatal: a "not started" result is logged but 0 is returned.
+#   -l  Pass the failure message to log().
+#   -s  Pass the failure message to syslog().
+#
+# Runs "$MMFSADM test verbs status" through check_cmd_output() and succeeds
+# when the captured output ends in ": started".  A non-zero status from
+# check_cmd_output() is returned unchanged, regardless of -0.
+function check_gpfs_verbs_status() {
+    local NONFATAL=0 LOG=0 SYSLOG=0 MSG=''
+    local RET OUTPUT OLD_DEBUG
+    # Keep the getopts variable out of the global scope.
+    local OPTION
+
+    OPTIND=1
+    while getopts ":0ls" OPTION ; do
+        case "$OPTION" in
+            0) NONFATAL=1 ;;
+            l) LOG=1 ;;
+            s) SYSLOG=1 ;;
+            :) die 1 "$CHECK:  Option -$OPTARG requires an argument." ; return 1 ;;
+            \?) die 1 "$CHECK:  Invalid option:  -$OPTARG" ; return 1 ;;
+        esac
+    done
+    shift $((OPTIND-1))
+
+    # DEBUG is unset while check_cmd_output() runs and restored afterwards.
+    # NOTE(review): presumably this silences that helper's debug output -- confirm.
+    OLD_DEBUG=$DEBUG
+    unset DEBUG
+    # $MMFSADM is left unquoted, so a value containing whitespace is
+    # word-split into a command and its arguments.
+    check_cmd_output -t "${CMD_TIMEOUT:-5}" -C "$FUNCNAME" -O OUTPUT -m '/status/' $MMFSADM test verbs status
+    RET=$?
+    export DEBUG=$OLD_DEBUG
+    if [[ $RET -ne 0 ]]; then
+        return $RET
+    fi
+
+    dbg "$MMFSADM test verbs status: \"$OUTPUT\""
+
+    if [[ "$OUTPUT" == *": started" ]]; then
+        return 0
+    fi
+    MSG="$FUNCNAME:  GPFS verbs is not started"
+    # The message is quoted so it reaches log()/syslog() as one argument,
+    # without word splitting or glob expansion.
+    if [[ "$LOG" == "1" ]]; then
+        log "$MSG"
+    fi
+    if [[ "$SYSLOG" == "1" ]]; then
+        syslog "$MSG"
+    fi
+    if [[ $NONFATAL == 1 ]]; then
+        if [[ -n "$MSG" ]]; then
+            log "$MSG (non-fatal)"
+        fi
+        return 0
+    fi
+    die 1 "$MSG"
+    return 1
+}