From 5ef4db74649b8be03402c17aa29c024e71699a7b Mon Sep 17 00:00:00 2001 From: AndrewEichmann-NOAA <58948505+AndrewEichmann-NOAA@users.noreply.github.com> Date: Thu, 11 Jul 2024 08:59:24 -0400 Subject: [PATCH 1/3] Adds contents of constructor and initialize methods to marine LETKF class (#2635) Adds contents of constructor and initialize methods to marine LETKF class Partially addresses https://github.com/NOAA-EMC/GDASApp/issues/1091 --------- Co-authored-by: Rahul Mahajan Co-authored-by: Cory Martin --- env/HERA.env | 10 +-- env/ORION.env | 10 +-- ...IS_LETKF => JGLOBAL_MARINE_ANALYSIS_LETKF} | 8 +- .../{ocnanalletkf.sh => marineanalletkf.sh} | 4 +- parm/config/gfs/config.marineanalletkf | 18 ++++ parm/config/gfs/config.ocnanal | 4 +- parm/config/gfs/config.ocnanalletkf | 11 --- parm/config/gfs/config.resources | 20 ++--- ush/python/pygfs/task/marine_letkf.py | 83 ++++++++++++++++++- 9 files changed, 126 insertions(+), 42 deletions(-) rename jobs/{JGDAS_GLOBAL_OCEAN_ANALYSIS_LETKF => JGLOBAL_MARINE_ANALYSIS_LETKF} (82%) rename jobs/rocoto/{ocnanalletkf.sh => marineanalletkf.sh} (87%) create mode 100644 parm/config/gfs/config.marineanalletkf delete mode 100644 parm/config/gfs/config.ocnanalletkf diff --git a/env/HERA.env b/env/HERA.env index db63f0bfa58..b743a19a624 100755 --- a/env/HERA.env +++ b/env/HERA.env @@ -140,13 +140,13 @@ elif [[ "${step}" = "ocnanalecen" ]]; then [[ ${NTHREADS_OCNANALECEN} -gt ${nth_max} ]] && export NTHREADS_OCNANALECEN=${nth_max} export APRUN_OCNANALECEN="${launcher} -n ${npe_ocnanalecen} --cpus-per-task=${NTHREADS_OCNANALECEN}" -elif [[ "${step}" = "ocnanalletkf" ]]; then +elif [[ "${step}" = "marineanalletkf" ]]; then - nth_max=$((npe_node_max / npe_node_ocnanalletkf)) + nth_max=$((npe_node_max / npe_node_marineanalletkf)) - export NTHREADS_OCNANALLETKF=${nth_ocnanalletkf:-${nth_max}} - [[ ${NTHREADS_OCNANALLETKF} -gt ${nth_max} ]] && export NTHREADS_OCNANALLETKF=${nth_max} - export APRUN_OCNANALLETKF="${launcher} -n ${npe_ocnanalletkf} --cpus-per-task=${NTHREADS_OCNANALLETKF}" + export NTHREADS_MARINEANALLETKF=${nth_marineanalletkf:-${nth_max}} + [[ ${NTHREADS_MARINEANALLETKF} -gt ${nth_max} ]] && export NTHREADS_MARINEANALLETKF=${nth_max} + export APRUN_MARINEANALLETKF="${launcher} -n ${npe_marineanalletkf} --cpus-per-task=${NTHREADS_MARINEANALLETKF}" elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then diff --git a/env/ORION.env b/env/ORION.env index 502e99e1928..c203acae485 100755 --- a/env/ORION.env +++ b/env/ORION.env @@ -148,13 +148,13 @@ elif [[ "${step}" = "ocnanalecen" ]]; then [[ ${NTHREADS_OCNANALECEN} -gt ${nth_max} ]] && export NTHREADS_OCNANALECEN=${nth_max} export APRUN_OCNANALECEN="${launcher} -n ${npe_ocnanalecen} --cpus-per-task=${NTHREADS_OCNANALECEN}" -elif [[ "${step}" = "ocnanalletkf" ]]; then +elif [[ "${step}" = "marineanalletkf" ]]; then - nth_max=$((npe_node_max / npe_node_ocnanalletkf)) + nth_max=$((npe_node_max / npe_node_marineanalletkf)) - export NTHREADS_OCNANALLETKF=${nth_ocnanalletkf:-${nth_max}} - [[ ${NTHREADS_OCNANALLETKF} -gt ${nth_max} ]] && export NTHREADS_OCNANALLETKF=${nth_max} - export APRUN_OCNANALLETKF="${launcher} -n ${npe_ocnanalletkf} --cpus-per-task=${NTHREADS_OCNANALLETKF}" + export NTHREADS_MARINEANALLETKF=${nth_marineanalletkf:-${nth_max}} + [[ ${NTHREADS_MARINEANALLETKF} -gt ${nth_max} ]] && export NTHREADS_MARINEANALLETKF=${nth_max} + export APRUN_MARINEANALLETKF="${launcher} -n ${npe_marineanalletkf} --cpus-per-task=${NTHREADS_MARINEANALLETKF}" elif [[ "${step}" = "anal" ]] || [[ "${step}" = 
"analcalc" ]]; then diff --git a/jobs/JGDAS_GLOBAL_OCEAN_ANALYSIS_LETKF b/jobs/JGLOBAL_MARINE_ANALYSIS_LETKF similarity index 82% rename from jobs/JGDAS_GLOBAL_OCEAN_ANALYSIS_LETKF rename to jobs/JGLOBAL_MARINE_ANALYSIS_LETKF index d03ddfc19ac..38dc3049f90 100755 --- a/jobs/JGDAS_GLOBAL_OCEAN_ANALYSIS_LETKF +++ b/jobs/JGLOBAL_MARINE_ANALYSIS_LETKF @@ -1,6 +1,6 @@ #!/bin/bash source "${HOMEgfs}/ush/preamble.sh" -source "${HOMEgfs}/ush/jjob_header.sh" -e "ocnanalletkf" -c "base ocnanal ocnanalletkf" +source "${HOMEgfs}/ush/jjob_header.sh" -e "marineanalletkf" -c "base ocnanal marineanalletkf" ############################################## # Set variables used in the script @@ -13,8 +13,10 @@ gPDY=${GDATE:0:8} gcyc=${GDATE:8:2} YMD=${gPDY} HH=${gcyc} declare_from_tmpl -rx \ - COM_OCEAN_HISTORY_PREV:COM_OCEAN_HISTORY_TMPL \ - COM_ICE_HISTORY_PREV:COM_ICE_HISTORY_TMPL + COMIN_OCEAN_HISTORY_PREV:COM_OCEAN_HISTORY_TMPL \ + COMIN_ICE_HISTORY_PREV:COM_ICE_HISTORY_TMPL + +YMD=${PDY} HH=${cyc} declare_from_tmpl -rx COMIN_OBS:COM_OBS_TMPL ############################################## # Begin JOB SPECIFIC work diff --git a/jobs/rocoto/ocnanalletkf.sh b/jobs/rocoto/marineanalletkf.sh similarity index 87% rename from jobs/rocoto/ocnanalletkf.sh rename to jobs/rocoto/marineanalletkf.sh index f710be57104..f2bfb9f70c3 100755 --- a/jobs/rocoto/ocnanalletkf.sh +++ b/jobs/rocoto/marineanalletkf.sh @@ -8,7 +8,7 @@ source "${HOMEgfs}/ush/preamble.sh" status=$? [[ ${status} -ne 0 ]] && exit "${status}" -export job="ocnanalletkf" +export job="marineanalletkf" export jobid="${job}.$$" ############################################################### @@ -18,6 +18,6 @@ export PYTHONPATH ############################################################### # Execute the JJOB -"${HOMEgfs}/jobs/JGDAS_GLOBAL_OCEAN_ANALYSIS_LETKF" +"${HOMEgfs}/jobs/JGLOBAL_MARINE_ANALYSIS_LETKF" status=$? exit "${status}" diff --git a/parm/config/gfs/config.marineanalletkf b/parm/config/gfs/config.marineanalletkf new file mode 100644 index 00000000000..fde3433a13d --- /dev/null +++ b/parm/config/gfs/config.marineanalletkf @@ -0,0 +1,18 @@ +#!/bin/bash + +########## config.marineanalletkf ########## +# Ocn Analysis specific + +echo "BEGIN: config.marineanalletkf" + +# Get task specific resources +. 
"${EXPDIR}/config.resources" marineanalletkf + +export MARINE_LETKF_EXEC="${JEDI_BIN}/gdas.x" +export MARINE_LETKF_YAML_TMPL="${PARMgfs}/gdas/soca/letkf/letkf.yaml.j2" +export MARINE_LETKF_STAGE_YAML_TMPL="${PARMgfs}/gdas/soca/letkf/letkf_stage.yaml.j2" + +export GRIDGEN_EXEC="${JEDI_BIN}/gdas_soca_gridgen.x" +export GRIDGEN_YAML="${PARMgfs}/gdas/soca/gridgen/gridgen.yaml" + +echo "END: config.marineanalletkf" diff --git a/parm/config/gfs/config.ocnanal b/parm/config/gfs/config.ocnanal index 38a6cbd52a6..367e570ec8f 100644 --- a/parm/config/gfs/config.ocnanal +++ b/parm/config/gfs/config.ocnanal @@ -16,8 +16,8 @@ export SOCA_NINNER=@SOCA_NINNER@ export CASE_ANL=@CASE_ANL@ export DOMAIN_STACK_SIZE=116640000 #TODO: Make the stack size resolution dependent export JEDI_BIN=${HOMEgfs}/sorc/gdas.cd/build/bin - -export COMIN_OBS=@COMIN_OBS@ +export SOCA_FIX_STAGE_YAML_TMPL="${PARMgfs}/gdas/soca/soca_fix_stage.yaml.j2" +export SOCA_ENS_BKG_STAGE_YAML_TMPL="${PARMgfs}/gdas/soca/soca_ens_bkg_stage.yaml.j2" # NICAS export NICAS_RESOL=@NICAS_RESOL@ diff --git a/parm/config/gfs/config.ocnanalletkf b/parm/config/gfs/config.ocnanalletkf deleted file mode 100644 index b67f37152e7..00000000000 --- a/parm/config/gfs/config.ocnanalletkf +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -########## config.ocnanalletkf ########## -# Ocn Analysis specific - -echo "BEGIN: config.ocnanalletkf" - -# Get task specific resources -. "${EXPDIR}/config.resources" ocnanalletkf - -echo "END: config.ocnanalletkf" diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 5c3a1008801..e16524ecd3c 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -25,7 +25,7 @@ if (( $# != 1 )); then echo "waveinit waveprep wavepostsbs wavepostbndpnt wavepostbndpntbll wavepostpnt" echo "wavegempak waveawipsbulls waveawipsgridded" echo "postsnd awips gempak npoess" - echo "ocnanalprep prepoceanobs ocnanalbmat ocnanalrun ocnanalecen ocnanalletkf ocnanalchkpt ocnanalpost ocnanalvrfy" + echo "ocnanalprep prepoceanobs ocnanalbmat ocnanalrun ocnanalecen marineanalletkf ocnanalchkpt ocnanalpost ocnanalvrfy" exit 1 fi @@ -557,32 +557,32 @@ case ${step} in export memory_ocnanalecen ;; - "ocnanalletkf") + "marineanalletkf") npes=16 case ${OCNRES} in "025") npes=480 - memory_ocnanalletkf="96GB" + memory_marineanalletkf="96GB" ;; "050") npes=16 - memory_ocnanalletkf="96GB" + memory_marineanalletkf="96GB" ;; "500") npes=16 - memory_ocnanalletkf="24GB" + memory_marineanalletkf="24GB" ;; *) echo "FATAL ERROR: Resources not defined for job ${step} at resolution ${OCNRES}" exit 4 esac - export wtime_ocnanalletkf="00:10:00" - export npe_ocnanalletkf=${npes} - export nth_ocnanalletkf=1 + export wtime_marineanalletkf="00:10:00" + export npe_marineanalletkf=${npes} + export nth_marineanalletkf=1 export is_exclusive=True - export npe_node_ocnanalletkf=$(( npe_node_max / nth_ocnanalletkf )) - export memory_ocnanalletkf + export npe_node_marineanalletkf=$(( npe_node_max / nth_marineanalletkf )) + export memory_marineanalletkf ;; diff --git a/ush/python/pygfs/task/marine_letkf.py b/ush/python/pygfs/task/marine_letkf.py index 0ae5bea98d1..0fdd3d9aba5 100644 --- a/ush/python/pygfs/task/marine_letkf.py +++ b/ush/python/pygfs/task/marine_letkf.py @@ -1,11 +1,16 @@ #!/usr/bin/env python3 +import f90nml from logging import getLogger +import os from pygfs.task.analysis import Analysis from typing import Dict -from wxflow import (chdir, +from wxflow import (AttrDict, + FileHandler, logit, - Task) + parse_j2yaml, 
+ to_timedelta, + to_YMDH) logger = getLogger(__name__.split('.')[-1]) @@ -30,6 +35,21 @@ def __init__(self, config: Dict) -> None: logger.info("init") super().__init__(config) + _half_assim_freq = to_timedelta(f"{self.task_config.assim_freq}H") / 2 + _letkf_yaml_file = 'letkf.yaml' + _letkf_exec_args = [self.task_config.MARINE_LETKF_EXEC, + 'soca', + 'localensembleda', + _letkf_yaml_file] + + self.task_config.WINDOW_MIDDLE = self.task_config.current_cycle + self.task_config.WINDOW_BEGIN = self.task_config.current_cycle - _half_assim_freq + self.task_config.letkf_exec_args = _letkf_exec_args + self.task_config.letkf_yaml_file = _letkf_yaml_file + self.task_config.mom_input_nml_tmpl = os.path.join(self.task_config.DATA, 'mom_input.nml.tmpl') + self.task_config.mom_input_nml = os.path.join(self.task_config.DATA, 'mom_input.nml') + self.task_config.obs_dir = os.path.join(self.task_config.DATA, 'obs') + @logit(logger) def initialize(self): """Method initialize for ocean and sea ice LETKF task @@ -43,6 +63,63 @@ def initialize(self): logger.info("initialize") + # make directories and stage ensemble background files + ensbkgconf = AttrDict() + keys = ['previous_cycle', 'current_cycle', 'DATA', 'NMEM_ENS', + 'PARMgfs', 'ROTDIR', 'COM_OCEAN_HISTORY_TMPL', 'COM_ICE_HISTORY_TMPL'] + for key in keys: + ensbkgconf[key] = self.task_config[key] + ensbkgconf.RUN = 'enkfgdas' + soca_ens_bkg_stage_list = parse_j2yaml(self.task_config.SOCA_ENS_BKG_STAGE_YAML_TMPL, ensbkgconf) + FileHandler(soca_ens_bkg_stage_list).sync() + soca_fix_stage_list = parse_j2yaml(self.task_config.SOCA_FIX_STAGE_YAML_TMPL, self.task_config) + FileHandler(soca_fix_stage_list).sync() + letkf_stage_list = parse_j2yaml(self.task_config.MARINE_LETKF_STAGE_YAML_TMPL, self.task_config) + FileHandler(letkf_stage_list).sync() + + obs_list = parse_j2yaml(self.task_config.OBS_YAML, self.task_config) + + # get the list of observations + obs_files = [] + for ob in obs_list['observers']: + obs_name = ob['obs space']['name'].lower() + obs_filename = f"{self.task_config.RUN}.t{self.task_config.cyc}z.{obs_name}.{to_YMDH(self.task_config.current_cycle)}.nc" + obs_files.append((obs_filename, ob)) + + obs_files_to_copy = [] + obs_to_use = [] + # copy obs from COMIN_OBS to DATA/obs + for obs_file, ob in obs_files: + obs_src = os.path.join(self.task_config.COMIN_OBS, obs_file) + obs_dst = os.path.join(self.task_config.DATA, self.task_config.obs_dir, obs_file) + if os.path.exists(obs_src): + obs_files_to_copy.append([obs_src, obs_dst]) + obs_to_use.append(ob) + else: + logger.warning(f"{obs_file} is not available in {self.task_config.COMIN_OBS}") + + # stage the desired obs files + FileHandler({'copy': obs_files_to_copy}).sync() + + # make the letkf.yaml + letkfconf = AttrDict() + keys = ['WINDOW_BEGIN', 'WINDOW_MIDDLE', 'RUN', 'gcyc', 'NMEM_ENS'] + for key in keys: + letkfconf[key] = self.task_config[key] + letkfconf.RUN = 'enkfgdas' + letkf_yaml = parse_j2yaml(self.task_config.MARINE_LETKF_YAML_TMPL, letkfconf) + letkf_yaml.observations.observers = obs_to_use + letkf_yaml.save(self.task_config.letkf_yaml_file) + + # swap date and stack size in mom_input.nml + domain_stack_size = self.task_config.DOMAIN_STACK_SIZE + ymdhms = [int(s) for s in self.task_config.WINDOW_BEGIN.strftime('%Y,%m,%d,%H,%M,%S').split(',')] + with open(self.task_config.mom_input_nml_tmpl, 'r') as nml_file: + nml = f90nml.read(nml_file) + nml['ocean_solo_nml']['date_init'] = ymdhms + nml['fms_nml']['domains_stack_size'] = int(domain_stack_size) + 
nml.write(self.task_config.mom_input_nml, force=True)  # force to overwrite if necessary
+
    @logit(logger)
    def run(self):
        """Method run for ocean and sea ice LETKF task
@@ -56,8 +133,6 @@ def run(self):
        logger.info("run")
-        chdir(self.runtime_config.DATA)
-
    @logit(logger)
    def finalize(self):
        """Method finalize for ocean and sea ice LETKF task
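The `initialize()` method above patches the MOM6 namelist with f90nml. A minimal standalone sketch of that pattern, with hypothetical paths and hard-coded stand-ins for the values the task pulls from `task_config`:

```python
import f90nml
from datetime import datetime

# Assumed stand-ins for illustration; the task derives these from task_config.
window_begin = datetime(2024, 7, 11, 3)
domain_stack_size = 116640000  # DOMAIN_STACK_SIZE from config.ocnanal

# Same date trick as above: one strftime, split into [Y, m, d, H, M, S] integers.
ymdhms = [int(s) for s in window_begin.strftime('%Y,%m,%d,%H,%M,%S').split(',')]

with open('mom_input.nml.tmpl', 'r') as nml_file:  # hypothetical template path
    nml = f90nml.read(nml_file)
nml['ocean_solo_nml']['date_init'] = ymdhms        # start of the DA window
nml['fms_nml']['domains_stack_size'] = domain_stack_size
nml.write('mom_input.nml', force=True)             # force=True overwrites an existing file
```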
From 4968f3a8de9a5f90651cacd74e38f97bc80b7bbb Mon Sep 17 00:00:00 2001
From: TerrenceMcGuinness-NOAA
Date: Thu, 11 Jul 2024 17:48:47 +0000
Subject: [PATCH 2/3] CI maintenance updates and adding CI Unit Tests (#2740)

This PR has a few maintenance updates to the CI pipeline and adds a test directory with Unit Tests

**Major Maintenance updates:**
- Added try blocks with appropriate GitHub PR messaging on failure for:
  - **scm** checkout
  - build fail (with error logs sent as gists)
  - create experiment fails with `stderr` sent to GitHub PR messaging
- Pre-stage FAILs from the above are now captured; these fails allow FINALIZE to update the label to FAIL (i.e. no more "hanging" CI state labels in GitHub)

**Minor Maintenance updates:**
- Fix for STALLED cases revealed by PR 2700 (just needed a lambda specifier)
- Fixed path to experiment directory in PR message (it had dropped EXPDIR from the path)
- Needed the `latin-1` decoder when reading log files for publishing

**Added python Unit Tests for CI functionality:**
- Installed **Rocoto** and **wxflow** in the GitHub Runner for testing key CI utility codes
- Cached the install of Rocoto in the GitHub Runners to greatly reduce setup time for running the unit tests
- Unit Test Python scripts added:
  - `test_rocotostat.py`: rocoto_statcount() rocotostat_summary() is_stalled()
  - `test_setup.py`: setup_expt() test_setup_xml()
  - `test_create_experiment.py`: test_create_experiment()
    - Runs all PR cases that do not have ICs in the GitHub Runner
- Reporting mechanism in the Actions tab for Python Unit Testing results
- Test case data for STALLED and RUNNING stored on S3 and pulled using wget during runtime of tests
---
.github/workflows/ci_unit_tests.yaml | 64 +++++++++++++++
ci/Jenkinsfile | 29 ++++---
ci/cases/yamls/gefs_ci_defaults.yaml | 2 +-
ci/scripts/tests/test_create_experiment.py | 29 +++++++
ci/scripts/tests/test_rocotostat.py | 90 ++++++++++++++++++++++
ci/scripts/tests/test_setup.py | 89 +++++++++++++++++++++
ci/scripts/utils/publish_logs.py | 6 +-
ci/scripts/utils/rocotostat.py | 31 +++++++-
sorc/wxflow | 2 +-
9 files changed, 327 insertions(+), 15 deletions(-)
create mode 100644 .github/workflows/ci_unit_tests.yaml
create mode 100644 ci/scripts/tests/test_create_experiment.py
create mode 100755 ci/scripts/tests/test_rocotostat.py
create mode 100755 ci/scripts/tests/test_setup.py

diff --git a/.github/workflows/ci_unit_tests.yaml b/.github/workflows/ci_unit_tests.yaml
new file mode 100644
index 00000000000..e22f63bf566
--- /dev/null
+++ b/.github/workflows/ci_unit_tests.yaml
@@ -0,0 +1,64 @@
+name: CI Unit Tests
+on: [pull_request, push, workflow_dispatch]
+
+jobs:
+
+  ci_pytest:
+    runs-on: ubuntu-latest
+    name: Run unit tests on CI system
+    permissions:
+      checks: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11.8
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y perl libxml-libxml-perl libxml-libxslt-perl libdatetime-perl
+          python -m pip install --upgrade pip
+          pip install pytest
+          pip install wxflow
+          pip install wget
+
+      - name: Cache Rocoto Install
+        uses: actions/cache@v4
+        with:
+          path: ~/rocoto
+          key: ${{ runner.os }}-rocoto-${{ hashFiles('**/ci-unit_tests.yaml') }}
+
+      - name: Install Rocoto
+        run: |
+          if [ ! -d "$HOME/rocoto/bin" ]; then
+            git clone https://github.com/christopherwharrop/rocoto.git $HOME/rocoto
+            cd $HOME/rocoto
+            ./INSTALL
+          fi
+          echo "$HOME/rocoto/bin" >> $GITHUB_PATH
+
+      - name: Run tests
+        shell: bash
+        run: |
+          sudo mkdir -p /scratch1/NCEPDEV
+          cd $GITHUB_WORKSPACE/sorc
+          git submodule update --init --recursive
+          ./link_workflow.sh
+          cd $GITHUB_WORKSPACE/ci/scripts/tests
+          ln -s ../wxflow
+
+          pytest -v --junitxml $GITHUB_WORKSPACE/ci/scripts/tests/test-results.xml
+
+
+      - name: Publish Test Results
+        if: always()
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          files: ci/scripts/tests/test-results.xml
+          job_summary: true
+          comment_mode: off
diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile
index 956bd692dd5..05d38b7898c 100644
--- a/ci/Jenkinsfile
+++ b/ci/Jenkinsfile
@@ -14,7 +14,7 @@ pipeline {
options {
skipDefaultCheckout()
- //parallelsAlwaysFailFast()
+ parallelsAlwaysFailFast()
}
stages {
// This initial stage is used to get the Machine name from the GitHub labels on the PR
@@ -90,9 +90,6 @@ pipeline {
stage('3. Build System') {
matrix {
agent { label NodeName[machine].toLowerCase() }
- //options {
- // throttle(['global_matrix_build'])
- //}
axes {
axis {
name 'system'
@@ -102,6 +99,7 @@ stages {
stage('build system') {
steps {
+ catchError(buildResult: 'UNSTABLE', stageResult: 'FAILURE') {
script {
def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to build the system on a per-system basis under the custom workspace for each build system
sh(script: "mkdir -p ${HOMEgfs}")
@@ -120,8 +118,8 @@
if (env.CHANGE_ID) {
sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Checkout **Failed** on ${Machine}: ${e.getMessage()}" """)
}
- echo "Failed to checkout: ${e.getMessage()}"
STATUS = 'Failed'
+ error("Failed to checkout: ${e.getMessage()}")
}
def gist_url = ""
def error_logs = ""
@@ -155,6 +153,7 @@
} catch (Exception error_comment) {
echo "Failed to comment on PR: ${error_comment.getMessage()}"
}
+ STATUS = 'Failed'
error("Failed to build system on ${Machine}")
}
}
@@ -174,6 +173,7 @@
}
}
}
+ }
}
}
}
}
stage('4. Run Tests') {
- failFast false
+ when {
+ expression { STATUS != 'Failed' }
+ }
matrix {
agent { label NodeName[machine].toLowerCase() }
axes {
axis {
name 'Case'
@@ -198,14 +200,21 @@ expression { return caseList.contains(Case) }
}
steps {
+ catchError(buildResult: 'UNSTABLE', stageResult: 'FAILURE') {
script {
sh(script: "sed -n '/{.*}/!p' ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml > ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp")
def yaml_case = readYaml file: "${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp"
system = yaml_case.experiment.system
def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to populate the XML on a per-system basis
env.RUNTESTS = "${CUSTOM_WORKSPACE}/RUNTESTS"
- sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml")
+ try {
+ error_output = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml", returnStdout: true).trim()
+ } catch (Exception error_create) {
+ sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "${Case} **FAILED** to create experiment on ${Machine}\n with the error:\n\\`\\`\\`\n${error_output}\\`\\`\\`" """)
+ error("Case ${Case} failed to create experiment directory")
+ }
}
+ }
}
}
@@ -213,7 +222,6 @@
when {
expression { return caseList.contains(Case) }
}
- failFast false
steps {
script {
HOMEgfs = "${CUSTOM_WORKSPACE}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments
@@ -255,11 +263,11 @@
STATUS = 'Failed'
try {
sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true)
- sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${CUSTOM_WORKSPACE}/RUNTESTS/${pslot}\\`" """)
+ sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${CUSTOM_WORKSPACE}/RUNTESTS/EXPDIR/${pslot}\\`" """)
} catch (Exception e) {
echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}"
}
- error("Failed to run experiments ${Case} on ${Machine}")
+ echo "Failed to run experiments ${Case} on ${Machine}"
}
}
}
@@ -268,6 +276,7 @@
}
}
}
+
stage( '5. FINALIZE' ) {
agent { label NodeName[machine].toLowerCase() }
steps {
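The gefs_ci_defaults.yaml change just below renames the templated key from HPC_ACCOUNT to ACCOUNT while keeping the `getenv` Jinja filter. A sketch of the assumed filter semantics; the filter here is a hypothetical re-creation (wxflow registers the real one when it renders these templates):

```python
import os
from jinja2 import Environment

env = Environment()
# Hypothetical getenv filter: look up an environment variable, with a default.
env.filters['getenv'] = lambda name, default='UNKNOWN': os.environ.get(name, default)

os.environ['HPC_ACCOUNT'] = 'fv3-cpu'
rendered = env.from_string("ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}").render()
print(rendered)  # -> ACCOUNT: fv3-cpu
```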
diff --git a/ci/cases/yamls/gefs_ci_defaults.yaml b/ci/cases/yamls/gefs_ci_defaults.yaml
index ceb36d4acb7..05a97edd900 100644
--- a/ci/cases/yamls/gefs_ci_defaults.yaml
+++ b/ci/cases/yamls/gefs_ci_defaults.yaml
@@ -1,4 +1,4 @@
defaults: !INC {{ HOMEgfs }}/parm/config/gefs/yaml/defaults.yaml
base:
- HPC_ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
+ ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
diff --git a/ci/scripts/tests/test_create_experiment.py b/ci/scripts/tests/test_create_experiment.py
new file mode 100644
index 00000000000..03f3a30805d
--- /dev/null
+++ b/ci/scripts/tests/test_create_experiment.py
@@ -0,0 +1,29 @@
+from wxflow import Executable
+from shutil import rmtree
+import os
+import copy
+
+_here = os.path.dirname(__file__)
+HOMEgfs = os.sep.join(_here.split(os.sep)[:-3])
+RUNDIR = os.path.join(_here, 'testdata/RUNDIR')
+
+
+def test_create_experiment():
+
+    create_experiment_script = Executable(f'{HOMEgfs}/workflow/create_experiment.py')
+    yaml_dir = os.path.join(HOMEgfs, 'ci/cases/pr')
+    env = os.environ.copy()
+    env['RUNTESTS'] = RUNDIR
+
+    for case in os.listdir(yaml_dir):
+        if case.endswith('.yaml'):
+            with open(os.path.join(yaml_dir, case), 'r') as file:
+                file_contents = file.read()
+                if 'ICSDIR_ROOT' not in file_contents:
+                    create_experiment = copy.deepcopy(create_experiment_script)
+                    create_experiment.add_default_arg(['-y', f'../../cases/pr/{case}', '--overwrite'])
+                    env['pslot'] = os.path.splitext(case)[0]
+                    create_experiment(env=env)
+                    assert (create_experiment.returncode == 0)
+
+    rmtree(RUNDIR)
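The tests drive workflow scripts through wxflow's `Executable` wrapper, as in test_create_experiment.py above. A condensed usage sketch built only from the calls these tests exercise; the path and case name are illustrative:

```python
import os
from wxflow import Executable

# Build the command once; add_default_arg() accepts a string or a list of arguments.
create_experiment = Executable('workflow/create_experiment.py')  # illustrative path
create_experiment.add_default_arg(['-y', 'ci/cases/pr/C48_ATM.yaml', '--overwrite'])

env = os.environ.copy()
env['pslot'] = 'C48_ATM'    # experiment name the script reads from the environment
create_experiment(env=env)  # invoke like a function, with the augmented environment
assert create_experiment.returncode == 0
```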
diff --git a/ci/scripts/tests/test_rocotostat.py b/ci/scripts/tests/test_rocotostat.py
new file mode 100755
index 00000000000..f43f8df2f84
--- /dev/null
+++ b/ci/scripts/tests/test_rocotostat.py
@@ -0,0 +1,90 @@
+import sys
+import os
+from shutil import rmtree
+import wget
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(os.path.dirname(script_dir), 'utils'))
+
+from rocotostat import rocoto_statcount, rocotostat_summary, is_done, is_stalled, CommandNotFoundError
+from wxflow import which
+
+test_data_url = 'https://noaa-nws-global-pds.s3.amazonaws.com/data/CI/'
+
+testdata_path = 'testdata/rocotostat'
+testdata_full_path = os.path.join(script_dir, testdata_path)
+
+
+if not os.path.isfile(os.path.join(testdata_full_path, 'database.db')):
+    os.makedirs(testdata_full_path, exist_ok=True)
+    workflow_url = test_data_url + str(testdata_path) + '/workflow.xml'
+    workflow_destination = os.path.join(testdata_full_path, 'workflow.xml')
+    wget.download(workflow_url, workflow_destination)
+
+    database_url = test_data_url + str(testdata_path) + '/database.db'
+    database_destination = os.path.join(testdata_full_path, 'database.db')
+    wget.download(database_url, database_destination)
+
+try:
+    rocotostat = which('rocotostat')
+except CommandNotFoundError:
+    raise CommandNotFoundError("rocotostat not found in PATH")
+
+rocotostat.add_default_arg(['-w', os.path.join(testdata_path, 'workflow.xml'), '-d', os.path.join(testdata_path, 'database.db')])
+
+
+def test_rocoto_statcount():
+
+    result = rocoto_statcount(rocotostat)
+
+    assert result['SUCCEEDED'] == 20
+    assert result['FAIL'] == 0
+    assert result['DEAD'] == 0
+    assert result['RUNNING'] == 0
+    assert result['SUBMITTING'] == 0
+    assert result['QUEUED'] == 0
+
+
+def test_rocoto_summary():
+
+    result = rocotostat_summary(rocotostat)
+
+    assert result['CYCLES_TOTAL'] == 1
+    assert result['CYCLES_DONE'] == 1
+
+
+def test_rocoto_done():
+
+    result = rocotostat_summary(rocotostat)
+
+    assert is_done(result)
+
+    rmtree(testdata_full_path)
+
+
+def test_rocoto_stalled():
+    testdata_path = 'testdata/rocotostat_stalled'
+    testdata_full_path = os.path.join(script_dir, testdata_path)
+    xml = os.path.join(testdata_full_path, 'stalled.xml')
+    db = os.path.join(testdata_full_path, 'stalled.db')
+
+    if not os.path.isfile(os.path.join(testdata_full_path, 'stalled.db')):
+        os.makedirs(testdata_full_path, exist_ok=True)
+        workflow_url = test_data_url + str(testdata_path) + '/stalled.xml'
+        database_url = test_data_url + str(testdata_path) + '/stalled.db'
+
+        workflow_destination = os.path.join(testdata_full_path, 'stalled.xml')
+        wget.download(workflow_url, workflow_destination)
+
+        database_destination = os.path.join(testdata_full_path, 'stalled.db')
+        wget.download(database_url, database_destination)
+
+    rocotostat = which('rocotostat')
+    rocotostat.add_default_arg(['-w', xml, '-d', db])
+
+    result = rocoto_statcount(rocotostat)
+
+    assert result['SUCCEEDED'] == 11
+    assert is_stalled(result)
+
+    rmtree(testdata_full_path)
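rocoto_statcount(), exercised above, tallies job states from rocotostat's tabular output. A hypothetical, self-contained sketch of that counting idea (the real implementation lives in ci/scripts/utils/rocotostat.py and differs in detail):

```python
from collections import Counter

def count_states(rocotostat_output: str) -> Counter:
    """Tally the STATE column (4th field) of rocotostat's table."""
    counts = Counter({'SUCCEEDED': 0, 'FAIL': 0, 'DEAD': 0, 'RUNNING': 0})
    for line in rocotostat_output.splitlines()[1:]:  # skip the header row
        fields = line.split()
        if len(fields) >= 4:
            counts[fields[3]] += 1
    return counts

sample = """CYCLE TASK JOBID STATE EXIT TRIES DURATION
202107010000 gfsfcst 123 SUCCEEDED 0 1 10.0
202107010000 gfspost 124 RUNNING - 1 0.0"""
print(count_states(sample))  # SUCCEEDED: 1, RUNNING: 1; FAIL and DEAD stay 0
```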
diff --git a/ci/scripts/tests/test_setup.py b/ci/scripts/tests/test_setup.py
new file mode 100755
index 00000000000..77a36369f46
--- /dev/null
+++ b/ci/scripts/tests/test_setup.py
@@ -0,0 +1,89 @@
+from wxflow import Executable, Configuration, ProcessError
+from shutil import rmtree
+import pytest
+import os
+
+_here = os.path.dirname(__file__)
+HOMEgfs = os.sep.join(_here.split(os.sep)[:-3])
+RUNDIR = os.path.join(_here, 'testdata/RUNDIR')
+pslot = "C48_ATM"
+account = "fv3-cpu"
+foobar = "foobar"
+
+
+def test_setup_expt():
+
+    arguments = [
+        "gfs", "forecast-only",
+        "--pslot", pslot, "--app", "ATM", "--resdetatmos", "48",
+        "--comroot", f"{RUNDIR}", "--expdir", f"{RUNDIR}",
+        "--idate", "2021032312", "--edate", "2021032312", "--overwrite"
+    ]
+    setup_expt_script = Executable(os.path.join(HOMEgfs, "workflow", "setup_expt.py"))
+    setup_expt_script.add_default_arg(arguments)
+    setup_expt_script()
+    assert (setup_expt_script.returncode == 0)
+
+
+def test_setup_xml():
+
+    setup_xml_script = Executable(os.path.join(HOMEgfs, "workflow/setup_xml.py"))
+    setup_xml_script.add_default_arg(f"{RUNDIR}/{pslot}")
+    setup_xml_script()
+    assert (setup_xml_script.returncode == 0)
+
+    cfg = Configuration(f"{RUNDIR}/{pslot}")
+    base = cfg.parse_config('config.base')
+    assert base.ACCOUNT == account
+
+    assert "UNKNOWN" not in base.values()
+
+    with open(f"{RUNDIR}/{pslot}/{pslot}.xml", 'r') as file:
+        contents = file.read()
+        assert contents.count(account) > 5
+
+    rmtree(RUNDIR)
+
+
+def test_setup_xml_fail_config_env_cornercase():
+
+    script_content = ('''#!/usr/bin/env bash
+export HOMEgfs=foobar
+../../../workflow/setup_xml.py "${1}"\n
+''')
+
+    with open('run_setup_xml.sh', 'w') as file:
+        file.write(script_content)
+    os.chmod('run_setup_xml.sh', 0o755)
+
+    try:
+        setup_xml_script = Executable(os.path.join(HOMEgfs, "ci", "scripts", "tests", "run_setup_xml.sh"))
+        setup_xml_script.add_default_arg(f"{RUNDIR}/{pslot}")
+        setup_xml_script()
+        assert (setup_xml_script.returncode == 0)
+
+        cfg = Configuration(f"{RUNDIR}/{pslot}")
+        base = cfg.parse_config('config.base')
+        assert base.ACCOUNT == account
+
+        assert foobar not in base.values()
+        assert "UNKNOWN" not in base.values()
+
+        with open(f"{RUNDIR}/{pslot}/{pslot}.xml", 'r') as file:
+            contents = file.read()
+            assert contents.count(account) > 5
+
+    except ProcessError as e:
+        # We expect this to fail because ACCOUNT=fv3-cpu is set in config.base and the environment
+        pass
+    except Exception as e:
+        # Any other exception fails the test with a custom message
+        pytest.fail(f"Unexpected exception occurred: {e}")
+
+    finally:
+        # Cleanup code to ensure it runs regardless of test outcome
+        os.remove('run_setup_xml.sh')
+        try:
+            rmtree(RUNDIR)
+        except FileNotFoundError:
+            pass
diff --git a/ci/scripts/utils/publish_logs.py b/ci/scripts/utils/publish_logs.py
index 7768c17c100..283c84a8d1c 100755
--- a/ci/scripts/utils/publish_logs.py
+++ b/ci/scripts/utils/publish_logs.py
@@ -46,7 +46,8 @@ def add_logs_to_gist(args, emcbot_gh):
gist_files = {}
for file in args.file:
- file_content = file.read()
+ with open(file.name, 'r', encoding='latin-1') as file:
+ file_content = file.read()
gist_files[os.path.basename(file.name)] = emcbot_gh.InputFileContent(file_content)
gist = emcbot_gh.user.create_gist(public=True, files=gist_files, description=f"error log file from CI run {args.gist[0]}")
@@ -85,7 +86,8 @@ def upload_logs_to_repo(args, emcbot_gh, emcbot_ci_url):
break
for file in args.file:
- file_content = file.read()
+ with open(file.name, 'r', encoding='latin-1') as file:
+ file_content = file.read()
file_path_in_repo = f"{repo_path}/{path_header}/" + str(os.path.basename(file.name))
emcbot_gh.repo.create_file(file_path_in_repo, "Adding error log file", file_content, branch="error_logs")
diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py
index 9b1d8dcc3af..70c672f0e8e 100755
--- a/ci/scripts/utils/rocotostat.py
+++ b/ci/scripts/utils/rocotostat.py
@@ -14,6 +14,35 @@
def attempt_multiple_times(expression, max_attempts, sleep_duration=0, exception_class=Exception):
+    """
+    Retries a function multiple times.
+
+    Try to execute the function expression up to max_attempts times, ignoring any exceptions
+    of the type exception_class. It waits for sleep_duration seconds between attempts.
+
+    Parameters
+    ----------
+    expression : callable
+        The function to be executed.
+    max_attempts : int
+        The maximum number of attempts to execute the function.
+    sleep_duration : int, optional
+        The number of seconds to wait between attempts. Default is 0.
+    exception_class : Exception, optional
+        The type of exception to catch. Default is the base Exception class, catching all exceptions.
+
+    Returns
+    -------
+    The return value of the function expression.
+
+    Raises
+    ------
+    exception_class
+        If the function expression raises an exception of type exception_class
+        in all max_attempts attempts.
+ + """ + attempt = 0 last_exception = None while attempt < max_attempts: @@ -189,7 +218,7 @@ def is_stalled(rocoto_status): error_return = rocoto_status['UNKNOWN'] rocoto_state = 'UNKNOWN' elif is_stalled(rocoto_status): - rocoto_status = attempt_multiple_times(rocoto_statcount(rocotostat), 2, 120, ProcessError) + rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) if is_stalled(rocoto_status): error_return = 3 rocoto_state = 'STALLED' diff --git a/sorc/wxflow b/sorc/wxflow index 5dad7dd61ce..1356acdb2bb 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 +Subproject commit 1356acdb2bbca28e442597699da1a295faa18fe3 From e8a2e0facc7ba249c1d74639a6eb7c9a44ea9b69 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 12 Jul 2024 19:31:25 +0000 Subject: [PATCH 3/3] sync and remove gempak from noaacloud --- modulefiles/module_base.noaacloud.lua | 1 - sorc/gfs_utils.fd | 2 +- sorc/gsi_enkf.fd | 2 +- sorc/gsi_monitor.fd | 2 +- sorc/gsi_utils.fd | 2 +- sorc/ufs_utils.fd | 2 +- sorc/wxflow | 2 +- versions/run.noaacloud.ver | 2 -- workflow/rocoto/tasks.py | 1 + 9 files changed, 7 insertions(+), 9 deletions(-) diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua index 6d2182062e2..113409e41d9 100644 --- a/modulefiles/module_base.noaacloud.lua +++ b/modulefiles/module_base.noaacloud.lua @@ -9,7 +9,6 @@ load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) load(pathJoin("python", (os.getenv("python_ver") or "None"))) -load(pathJoin("gempak", (os.getenv("gempak_ver") or "None"))) load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) load(pathJoin("libpng", (os.getenv("libpng_ver") or "None"))) load(pathJoin("cdo", (os.getenv("cdo_ver") or "None"))) diff --git a/sorc/gfs_utils.fd b/sorc/gfs_utils.fd index 02ce084c244..0cdc2795260 160000 --- a/sorc/gfs_utils.fd +++ b/sorc/gfs_utils.fd @@ -1 +1 @@ -Subproject commit 02ce084c244823e22661d493a50236b7d5eaf70a +Subproject commit 0cdc2795260fc1b59e86a873729433a470794a97 diff --git a/sorc/gsi_enkf.fd b/sorc/gsi_enkf.fd index 529bb796bea..8e279f9c734 160000 --- a/sorc/gsi_enkf.fd +++ b/sorc/gsi_enkf.fd @@ -1 +1 @@ -Subproject commit 529bb796bea0e490f186729cd168a91c034bb12d +Subproject commit 8e279f9c734097f673b07e80f385b2623d13ba4a diff --git a/sorc/gsi_monitor.fd b/sorc/gsi_monitor.fd index e1f9f21af16..f9d6f5f7444 160000 --- a/sorc/gsi_monitor.fd +++ b/sorc/gsi_monitor.fd @@ -1 +1 @@ -Subproject commit e1f9f21af16ce912fdc2cd75c5b27094a550a0c5 +Subproject commit f9d6f5f744462a449e70abed8c5860b1c4564ad8 diff --git a/sorc/gsi_utils.fd b/sorc/gsi_utils.fd index 9382fd01c2a..43328145294 160000 --- a/sorc/gsi_utils.fd +++ b/sorc/gsi_utils.fd @@ -1 +1 @@ -Subproject commit 9382fd01c2a626c8934c3f553d420a45de2b4dec +Subproject commit 4332814529465ab8eb58e43a38227b952ebfca49 diff --git a/sorc/ufs_utils.fd b/sorc/ufs_utils.fd index 3ef2e6bd725..2794d413d08 160000 --- a/sorc/ufs_utils.fd +++ b/sorc/ufs_utils.fd @@ -1 +1 @@ -Subproject commit 3ef2e6bd725d2662fd6ee95897cb7bac222e5144 +Subproject commit 2794d413d083b43d9ba37a15375d5c61b610d29e diff --git a/sorc/wxflow b/sorc/wxflow index 1356acdb2bb..5dad7dd61ce 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 1356acdb2bbca28e442597699da1a295faa18fe3 +Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 diff --git a/versions/run.noaacloud.ver 
b/versions/run.noaacloud.ver index c85d29a71a6..4c9ac3cd428 100644 --- a/versions/run.noaacloud.ver +++ b/versions/run.noaacloud.ver @@ -2,8 +2,6 @@ export stack_intel_ver=2021.3.0 export stack_impi_ver=2021.3.0 export spack_env=gsi-addon-env -export gempak_ver=7.4.2 - source "${HOMEgfs:-}/versions/run.spack.ver" export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 785ced99002..fc47efb8002 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -241,6 +241,7 @@ def get_resource(self, task_name): #as below. Or, it won't run, but with an error: #"ufs_model.x: error while loading shared libraries: libiomp5.so: cannot open shared object file: No such file or directory" #Even the library path is clearly in LD_LIBRARY_PATH, or load exactly the modules when build ufs_model.x + #TODO: find a mechanism to provide native scheduler information. pw_csp = os.environ.get('PW_CSP', 'unknown') if ( pw_csp in ['aws', 'azure', 'google'] ): native = '--export=ALL --exclusive'
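A closing note on the lambda fix in ci/scripts/utils/rocotostat.py (PATCH 2): attempt_multiple_times() needs a callable it can re-invoke, so the call-site result `rocoto_statcount(rocotostat)` (evaluated once, before the retry helper is even entered) becomes `lambda: rocoto_statcount(rocotostat)`. A self-contained sketch with a simplified stand-in for the retry helper:

```python
import time

def attempt_multiple_times(expression, max_attempts, sleep_duration=0, exception_class=Exception):
    # Simplified stand-in for the real helper; same calling convention.
    for attempt in range(max_attempts):
        try:
            return expression()  # re-invoked on every attempt
        except exception_class:
            if attempt == max_attempts - 1:
                raise
            time.sleep(sleep_duration)

outcomes = iter([RuntimeError("transient"), {'SUCCEEDED': 11}])

def flaky_statcount():
    outcome = next(outcomes)
    if isinstance(outcome, Exception):
        raise outcome
    return outcome

# Passing flaky_statcount() directly would raise before any retry could happen;
# passing a lambda lets the helper absorb the first failure and try again.
print(attempt_multiple_times(lambda: flaky_statcount(), 2, exception_class=RuntimeError))
# -> {'SUCCEEDED': 11}
```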