Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add/fix build capability for Gaea-C5, Gaea-C6, and container #800

Merged
merged 13 commits into from
Nov 12, 2024
Merged
28 changes: 28 additions & 0 deletions modulefiles/gsi_container.intel.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- Lmod modulefile: GSI build environment inside a container (Intel compilers).
help([[
]])

-- Expose the spack-stack Core modulefiles before any module is loaded.
prepend_path(
  "MODULEPATH",
  "/opt/spack-stack/spack-stack-1.8.0/envs/unified-env/install/modulefiles/Core"
)

-- Look up `name` in the environment; use `fallback` when it is not set.
local function version_from_env(name, fallback)
  return os.getenv(name) or fallback
end

-- Each version can be overridden through the environment variable of the
-- same name; otherwise the default given here is used.
local intel_version = version_from_env("stack_intel_ver", "2021.10.0")
local impi_version = version_from_env("stack_impi_ver", "2021.12.2")
local cmake_version = version_from_env("cmake_ver", "3.27.9")
local prod_util_version = version_from_env("prod_util_ver", "2.1.1")

-- Modules are loaded one at a time, in exactly this order.
local modules_to_load = {
  pathJoin("stack-intel", intel_version),
  pathJoin("stack-intel-oneapi-mpi", impi_version),
  pathJoin("cmake", cmake_version),
  "gsi_common",
  pathJoin("prod_util", prod_util_version),
}
for _, module_name in ipairs(modules_to_load) do
  load(module_name)
end

-- The same architecture flag is exported for C and Fortran compiles.
local arch_flag = "-march=ivybridge"
pushenv("CFLAGS", arch_flag)
pushenv("FFLAGS", arch_flag)

-- Point the compiler variables at mpiicc / mpiicpc / mpiifort.
local compiler_vars = {
  { "CC", "mpiicc" },
  { "CXX", "mpiicpc" },
  { "FC", "mpiifort" },
  { "F90", "mpiifort" },
  { "F77", "mpiifort" },
}
for _, entry in ipairs(compiler_vars) do
  setenv(entry[1], entry[2])
end

-- src/gsi/CMakeLists.txt checks whether USE_BUFR4 is defined in the
-- environment when choosing which bufr library target to link.
pushenv("USE_BUFR4", "YES")

whatis("Description: GSI environment in a container with Intel Compilers")
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ help([[
prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")

local stack_python_ver=os.getenv("stack_python_ver") or "3.11.6"
local stack_intel_ver=os.getenv("stack_intel_ver") or "2023.1.0"
local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.25"
local stack_intel_ver=os.getenv("stack_intel_ver") or "2023.2.0"
local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.28"
local cmake_ver=os.getenv("cmake_ver") or "3.23.1"
local prod_util_ver=os.getenv("prod_util_ver") or "2.1.1"

Expand All @@ -17,10 +17,6 @@ load(pathJoin("cmake", cmake_ver))
load("gsi_common")
load(pathJoin("prod_util", prod_util_ver))

local MKLROOT="/opt/intel/oneapi/mkl/2022.0.2/"
prepend_path("LD_LIBRARY_PATH",pathJoin(MKLROOT,"lib/intel64"))
pushenv("MKLROOT", MKLROOT)

pushenv("GSI_BINARY_SOURCE_DIR", "/gpfs/f5/ufs-ard/world-shared/GSI_data/fix/gsi/20240208")

setenv("CC","cc")
Expand All @@ -29,4 +25,4 @@ setenv("CXX","CC")
pushenv("CRAYPE_LINK_TYPE","dynamic")

unload("cray-libsci")
whatis("Description: GSI environment on Gaea with Intel Compilers")
whatis("Description: GSI environment on GaeaC5 with Intel Compilers")
29 changes: 29 additions & 0 deletions modulefiles/gsi_gaeac6.intel.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- Lmod modulefile: GSI build environment on GaeaC6 (Intel compilers).
help([[
]])

-- Disabled alternative MODULEPATH entry, kept for reference:
--prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")
prepend_path(
  "MODULEPATH",
  "/ncrc/proj/epic/spack-stack/c6/spack-stack-1.6.0/envs/gsi-addon/install/modulefiles/Core"
)

-- Look up `name` in the environment; use `fallback` when it is not set.
local function version_from_env(name, fallback)
  return os.getenv(name) or fallback
end

-- Each version can be overridden through the environment variable of the
-- same name; otherwise the default given here is used.
local python_version = version_from_env("stack_python_ver", "3.11.6")
local intel_version = version_from_env("stack_intel_ver", "2023.2.0")
local cray_mpich_version = version_from_env("stack_cray_mpich_ver", "8.1.29")
local cmake_version = version_from_env("cmake_ver", "3.23.1")
local prod_util_version = version_from_env("prod_util_ver", "2.1.1")

-- Modules are loaded one at a time, in exactly this order.
local modules_to_load = {
  pathJoin("stack-intel", intel_version),
  pathJoin("stack-cray-mpich", cray_mpich_version),
  pathJoin("stack-python", python_version),
  pathJoin("cmake", cmake_version),
  "gsi_common",
  pathJoin("prod_util", prod_util_version),
}
for _, module_name in ipairs(modules_to_load) do
  load(module_name)
end

-- Location of the GSI fix files on the /gpfs/f6 filesystem.
pushenv(
  "GSI_BINARY_SOURCE_DIR",
  "/gpfs/f6/bil-fire8/world-shared/GSI_data/fix/gsi/20240208"
)

-- Point the compiler variables at the cc / ftn / CC commands.
local compiler_vars = {
  { "CC", "cc" },
  { "FC", "ftn" },
  { "CXX", "CC" },
}
for _, entry in ipairs(compiler_vars) do
  setenv(entry[1], entry[2])
end

pushenv("CRAYPE_LINK_TYPE", "dynamic")

-- cray-libsci is unloaded after everything else has been set up.
unload("cray-libsci")

whatis("Description: GSI environment on GaeaC6 with Intel Compilers")
60 changes: 46 additions & 14 deletions regression/regression_param.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,16 @@ case $machine in
memnode=96
numcore=40
;;
Gaea)
sub_cmd="sub_gaea"
gaeac5)
sub_cmd="sub_gaeac5"
memnode=251
numcore=128
;;
gaeac6)
sub_cmd="sub_gaeac6"
memnode=384
numcore=192
;;
wcoss2)
sub_cmd="sub_wcoss2"
memnode=512
Expand Down Expand Up @@ -73,7 +78,10 @@ case $regtest in
elif [[ "$machine" = "Discover" ]]; then
topts[1]="0:30:00" ; popts[1]="48/2" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="60/3" ; ropts[2]="/2"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
Expand Down Expand Up @@ -103,9 +111,12 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="40/3/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="40/5/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:60:00" ; popts[1]="40/3/" ; ropts[1]="/1"
RussTreadon-NOAA marked this conversation as resolved.
Show resolved Hide resolved
topts[2]="0:60:00" ; popts[2]="40/5/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
Expand Down Expand Up @@ -133,7 +144,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="32/2/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="64/4/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
Expand Down Expand Up @@ -162,7 +176,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="32/2/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="64/4/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
Expand Down Expand Up @@ -192,9 +209,12 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="40/2/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="40/4/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="40/2/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="40/4/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="64/2/" ; ropts[2]="/1"
Expand Down Expand Up @@ -222,7 +242,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
Expand Down Expand Up @@ -252,7 +275,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
elif [[ "$machine" = "wcoss2" || "$machine" = "acorn" ]]; then
Expand Down Expand Up @@ -315,7 +341,13 @@ elif [[ "$machine" = "Jet" ]]; then
export MPI_BUFS_PER_HOST=256
export MPI_GROUP_MAX=256
export APRUN="srun -n \$ntasks --cpus-per-task=\$threads"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
export OMP_STACKSIZE=1024M
export MPI_BUFS_PER_PROC=256
export MPI_BUFS_PER_HOST=256
export MPI_GROUP_MAX=256
export APRUN="srun --export=ALL -n \$ntasks"
elif [[ "$machine" = "gaeac6" ]]; then
export OMP_STACKSIZE=1024M
export MPI_BUFS_PER_PROC=256
export MPI_BUFS_PER_HOST=256
Expand Down
18 changes: 15 additions & 3 deletions regression/regression_var.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ elif [[ -d /mnt/lfs4 || -d /jetmon || -d /mnt/lfs5 ]]; then # Jet
export machine="Jet"
elif [[ -d /discover ]]; then # NCCS Discover
export machine="Discover"
elif [[ -d /ncrc ]]; then # Gaea
export machine="Gaea"
elif [[ -d /gpfs/f5 ]]; then # GaeaC5
export machine="gaeac5"
elif [[ -d /gpfs/f6 ]]; then # GaeaC6
export machine="gaeac6"
elif [[ -d /data/prod ]]; then # S4
export machine="S4"
elif [[ -d /work ]]; then # Orion or Hercules
Expand All @@ -57,7 +59,7 @@ fi
echo "Running Regression Tests on '$machine'";

case $machine in
Gaea)
gaeac5)
export queue="normal"
export group="ufs-ard"
export noscrub="/gpfs/f5/${group}/scratch/${USER}/$LOGNAME/gsi_tmp/noscrub"
Expand All @@ -67,6 +69,16 @@ case $machine in
export check_resource="no"
export accnt="ufs-ard"
;;
gaeac6)
export queue="normal"
export group="bil-fire8"
export noscrub="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/noscrub"
export ptmp="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/ptmp"
export casesdir="/gpfs/f6/bil-fire8/world-shared/GSI_data/CASES/regtest"

export check_resource="no"
export accnt="bil-fire8"
;;
wcoss2 | acorn)
export local_or_default="${local_or_default:-/lfs/h2/emc/da/noscrub/$LOGNAME}"
if [ -d $local_or_default ]; then
Expand Down
6 changes: 5 additions & 1 deletion src/gsi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,11 @@ target_link_libraries(gsi_fortran_obj PUBLIC nemsio::nemsio)
target_link_libraries(gsi_fortran_obj PUBLIC ncio::ncio)
target_link_libraries(gsi_fortran_obj PUBLIC w3emc::w3emc_d)
target_link_libraries(gsi_fortran_obj PUBLIC sp::sp_d)
target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_d)
if(DEFINED ENV{USE_BUFR4})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks OK to me. Cross checking with @DavidHuber-NOAA . PR #791 upgrades to bufr/12.1.0. Not sure how the bufr logic added here might impact Dave's PR.

target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_4)
else()
target_link_libraries(gsi_fortran_obj PUBLIC bufr::bufr_d)
endif()
target_link_libraries(gsi_fortran_obj PUBLIC crtm::crtm)
if(GSI_MODE MATCHES "Regional")
target_link_libraries(gsi_fortran_obj PUBLIC wrf_io::wrf_io)
Expand Down
21 changes: 15 additions & 6 deletions ush/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ case $(hostname -f) in
dlogin0[1-9].dogwood.wcoss2.ncep.noaa.gov) MACHINE_ID=wcoss2 ;; ### dogwood01-9
dlogin10.dogwood.wcoss2.ncep.noaa.gov) MACHINE_ID=wcoss2 ;; ### dogwood10

gaea5[1-8]) MACHINE_ID=gaea ;; ### gaea51-58
gaea5[1-8].ncrc.gov) MACHINE_ID=gaea ;; ### gaea51-58
gaea5[1-8]) MACHINE_ID=gaeac5 ;; ### gaea51-58
gaea5[1-8].ncrc.gov) MACHINE_ID=gaeac5 ;; ### gaea51-58

gaea6[1-8]) MACHINE_ID=gaeac6 ;; ### gaea61-68
gaea6[1-8].ncrc.gov) MACHINE_ID=gaeac6 ;; ### gaea61-68

hfe0[1-9]) MACHINE_ID=hera ;; ### hera01-09
hfe1[0-2]) MACHINE_ID=hera ;; ### hera10-12
Expand Down Expand Up @@ -61,7 +64,10 @@ if [[ "${MACHINE_ID}" != "UNKNOWN" ]]; then
fi

# Try searching based on paths since hostname may not match on compute nodes
if [[ -d /lfs/h3 ]]; then
if [[ -d /opt/spack-stack ]]; then
# We are in a container
MACHINE_ID=container
elif [[ -d /lfs/h3 ]]; then
# We are on NOAA Cactus or Dogwood
MACHINE_ID=wcoss2
elif [[ -d /lfs/h1 && ! -d /lfs/h3 ]]; then
Expand All @@ -81,9 +87,12 @@ elif [[ -d /work ]]; then
else
MACHINE_ID=orion
fi
elif [[ -d /gpfs && -d /ncrc ]]; then
# We are on GAEA.
MACHINE_ID=gaea
elif [[ -d /gpfs/f5 ]]; then
# We are on GAEAC5.
MACHINE_ID=gaeac5
elif [[ -d /gpfs/f6 ]]; then
# We are on GAEAC6.
MACHINE_ID=gaeac6
elif [[ -d /data/prod ]]; then
# We are on SSEC's S4
MACHINE_ID=s4
Expand Down
7 changes: 7 additions & 0 deletions ush/module-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ elif [[ $MACHINE_ID = orion* ]] ; then
fi
module purge

elif [[ $MACHINE_ID = container ]] ; then
# We are in a container
if ( ! eval module help > /dev/null 2>&1 ) ; then
source /usr/lmod/lmod/init/bash
fi
module purge

elif [[ $MACHINE_ID = s4* ]] ; then
# We are on SSEC Wisconsin S4
if ( ! eval module help > /dev/null 2>&1 ) ; then
Expand Down
7 changes: 4 additions & 3 deletions ush/sub_gaea → ush/sub_gaeac5
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ output=${output:-$jobname.out}
myuser=$LOGNAME
myhost=$(hostname)

if [ -d /gpfs/f5/epic/scratch/${USER}/$LOGNAME ]; then
DATA=/gpfs/f5/epic/scratch/${USER}/$LOGNAME/tmp
if [ -d /gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME ]; then
DATA=/gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME/tmp
fi
DATA=${DATA:-$ptmp/tmp}

Expand Down Expand Up @@ -129,7 +129,7 @@ echo "" >>$cfile

echo "module reset" >> $cfile
echo "module use $modulefiles" >> $cfile
echo "module load gsi_gaea.intel" >> $cfile
echo "module load gsi_gaeac5.intel" >> $cfile
echo "module list" >> $cfile
echo "" >>$cfile

Expand Down Expand Up @@ -158,6 +158,7 @@ sbatch=${sbatch:-sbatch}
ofile=$DATA/subout$$
>$ofile
chmod 777 $ofile
export FI_VERBS_PREFER_XRC=0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this setting resolve what appears to be mpi_finalize problems on C5?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this setting resolve what appears to be mpi_finalize problems on C5?

It appears so. Here is the notice from Seth Underwood with Gaea C5: "After the C5 update, users reported that some jobs failed during the MPI_Finalize call. We have alerted ORNL and HPE. HPE has suggested setting the environment variable FI_VERBS_PREFER_XRC=0 in the run script (setenv FI_VERBS_PREFER_XRC 0, for csh; export FI_VERBS_PREFER_XRC=0). This has resolved the error in our tests. Please add this variable to your run script(s) if you also hit this error. Please note that we do not see any issues preemptively setting this environment variable."

Now that I think the MPI_Finalize issue is resolved, I am going to adjust the resources and test a little more. I'll let you know when I have my final changes in place for you to look over.

$sbatch $cfile >$ofile
rc=$?
cat $ofile
Expand Down
Loading
Loading