Merge branch 'master' into update-mvk-config
jasonb5 committed Oct 10, 2024
2 parents 0892b4a + 62a9b17 commit a5cfc65
Showing 16 changed files with 181 additions and 218 deletions.
26 changes: 17 additions & 9 deletions .github/workflows/testing.yml
@@ -39,28 +39,28 @@ jobs:
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
uses: docker/metadata-action@v5
with:
images: ghcr.io/ESMCI/cime
tags: |
type=raw,value=latest,enable=${{ github.event_name == 'push' }}
type=ref,event=pr,enable=${{ github.event_name == 'pull_request' }}
type=sha,format=long
- name: Build and push
uses: docker/build-push-action@v3
uses: docker/build-push-action@v6
with:
target: base
context: docker/
@@ -76,7 +76,7 @@ jobs:
timeout-minutes: 2
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Set up python
uses: actions/setup-python@v2
with:
@@ -102,7 +102,7 @@ jobs:
python-version: ['3.8', '3.9', '3.10']
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Run tests
shell: bash
env:
@@ -151,7 +151,7 @@ jobs:
driver: "mct"
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Cache inputdata
uses: actions/cache@v2
with:
@@ -178,6 +178,14 @@ jobs:
conda activate base
# container libnetcdf is 4.9.2 as cesm requires esmf >8.6.1
# e3sm scorpio incompatible with 4.9.2, downgrade to 4.9.1
# only reference found about scorpio incompatibility with 4.9.2 (https://github.com/E3SM-Project/scorpio/issues/554#issuecomment-1877361470)
# TODO open scorpio issue, possible solutions; 1. support two conda environments in container 2. maybe move from conda to spack? build all libraries in image
if [[ "${CIME_MODEL}" == "e3sm" ]]; then
mamba install -y 'libnetcdf=4.9.1'
fi
pytest -vvv --cov=CIME --machine docker --no-fortran-run --no-teardown CIME/tests/test_sys*
- uses: mxschmitt/action-tmate@v3
if: ${{ !always() }}
46 changes: 29 additions & 17 deletions CIME/XML/env_mach_pes.py
@@ -41,9 +41,6 @@ def get_value(
attribute=None,
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
@@ -58,12 +55,9 @@ def get_value(
value = EnvBase.get_value(self, vid, attribute, resolved, subgroup)

if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if (ngpus_per_node and value) and value < 0:
value = -1 * value * max_cputasks_per_gpu_node
elif value and value < 0:
@@ -176,18 +170,29 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
if self.get_value("OVERSUBSCRIBE_GPU"):
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("NGPUS_PER_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
if self.get_value("OVERSUBSCRIBE_GPU"):
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("NGPUS_PER_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
@@ -204,7 +209,14 @@ def get_total_nodes(self, total_tasks, max_thread_count):
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
max_thread_count = 1
tasks_per_node = self.get_tasks_per_node(total_tasks, max_thread_count)
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
if self.get_value("OVERSUBSCRIBE_GPU"):
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
else:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
num_nodes = int(math.ceil(float(total_tasks) / ngpus_per_node))
else:
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
return num_nodes, self.get_spare_nodes(num_nodes)

def get_spare_nodes(self, num_nodes):
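The new OVERSUBSCRIBE_GPU branch changes how many MPI tasks land on each GPU node. Below is a minimal sketch of that arithmetic (not part of this commit), assuming hypothetical machine settings of MAX_TASKS_PER_NODE=128, MAX_CPUTASKS_PER_GPU_NODE=64, and NGPUS_PER_NODE=4, with ESMF-aware threading off and one thread per task:

import math

# Sketch of the logic introduced in get_tasks_per_node()/get_total_nodes();
# not part of the commit. All machine values below are hypothetical.
def nodes_needed(total_tasks, max_tasks_per_node, max_cputasks_per_gpu_node,
                 ngpus_per_node, oversubscribe_gpu, max_thread_count=1):
    if oversubscribe_gpu:
        # Pack up to MAX_CPUTASKS_PER_GPU_NODE MPI tasks on each GPU node.
        tasks_per_node = min(max_tasks_per_node // max_thread_count,
                             max_cputasks_per_gpu_node, total_tasks)
    else:
        # Without oversubscription, tasks per node are capped at NGPUS_PER_NODE
        # (get_total_nodes likewise divides by NGPUS_PER_NODE in this case).
        tasks_per_node = min(max_tasks_per_node // max_thread_count,
                             ngpus_per_node, total_tasks)
    return int(math.ceil(float(total_tasks) / tasks_per_node))

print(nodes_needed(256, 128, 64, 4, oversubscribe_gpu=True))   # 4 nodes
print(nodes_needed(256, 128, 64, 4, oversubscribe_gpu=False))  # 64 nodes

With these assumed values, a 256-task case fits on 4 GPU nodes when oversubscription is allowed, but needs 64 nodes when each MPI task must own a GPU.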
16 changes: 4 additions & 12 deletions CIME/build.py
@@ -247,18 +247,10 @@ def get_standard_cmake_args(case, sharedpath):
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD")
openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD")
kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD")
cmake_args += f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload} -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload} -DKOKKOS_GPU_OFFLOAD={kokkos_gpu_offload} "

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
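For reference, the rewritten block composes one CMake define per offload model instead of the old single -DGPU_OFFLOAD flag. A sketch, not from the commit, with hypothetical values standing in for the case.get_value() lookups:

# Hypothetical values in place of case.get_value() lookups; "a100" and the
# TRUE/FALSE strings are illustrative only.
gpu_type = "a100"
openacc_gpu_offload = "TRUE"
openmp_gpu_offload = "FALSE"
kokkos_gpu_offload = "TRUE"

cmake_args = (
    f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload}"
    f" -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload}"
    f" -DKOKKOS_GPU_OFFLOAD={kokkos_gpu_offload} "
)
print(cmake_args)
# -DGPU_TYPE=a100 -DOPENACC_GPU_OFFLOAD=TRUE -DOPENMP_GPU_OFFLOAD=FALSE -DKOKKOS_GPU_OFFLOAD=TRUE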
67 changes: 0 additions & 67 deletions CIME/case/case.py
@@ -1301,9 +1301,6 @@ def configure(
non_local=False,
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
expect(
check_name(compset_name, additional_chars="."),
@@ -1561,64 +1558,6 @@
if test:
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if gpu_type and str(gpu_type).lower() != "none":
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != "none":
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
elif ngpus_per_node != 0:
expect(
False,
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
)

# Set these two GPU XML variables here to overwrite the default values
# Only set them for "cesm" model
if self._cime_model == "cesm":
self.set_value("GPU_TYPE", str(gpu_type).lower())
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

# --------------------------------------------
@@ -2440,9 +2379,6 @@ def create(
non_local=False,
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
@@ -2515,9 +2451,6 @@
non_local=non_local,
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
85 changes: 58 additions & 27 deletions CIME/case/case_setup.py
@@ -389,6 +389,48 @@ def _case_setup_impl(
+ case.iotasks,
)

# ----------------------------------------------------------------------------------------------------------
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs
# 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than
# the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically.
# 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = case.get_value("MAX_GPUS_PER_NODE")
gpu_type = case.get_value("GPU_TYPE")
openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD")
openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD")
kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD")
gpu_offload = (
openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload
)
ngpus_per_node = case.get_value("NGPUS_PER_NODE")
if gpu_type and str(gpu_type).lower() != "none":
if max_gpus_per_node <= 0:
raise RuntimeError(
f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run"
)
if not gpu_offload:
raise RuntimeError(
"GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled"
)
case.gpu_enabled = True
if ngpus_per_node >= 0:
case.set_value(
"NGPUS_PER_NODE",
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload:
raise RuntimeError(
"GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled"
)
elif ngpus_per_node and ngpus_per_node != 0:
raise RuntimeError(
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;"
)

# May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue)
env_batch = case.get_env("batch")
env_batch.set_job_defaults([(case.get_primary_job(), {})], case)
@@ -527,37 +569,26 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):


def _create_case_repo(self, caseroot):
version = run_cmd_no_fail("git --version")
result = re.findall(r"([0-9]+)\.([0-9]+)\.?[0-9]*", version)
major = int(result[0][0])
minor = int(result[0][1])

# gitinterface needs git version 2.28 or newer
if major > 2 or (major == 2 and minor >= 28):
self._gitinterface = GitInterface(
caseroot, logger, branch=self.get_value("CASE")
self._gitinterface = GitInterface(caseroot, logger, branch=self.get_value("CASE"))
if self._gitinterface and not os.path.exists(os.path.join(caseroot, ".gitignore")):
safe_copy(
os.path.join(
self.get_value("CIMEROOT"),
"CIME",
"data",
"templates",
"gitignore.template",
),
os.path.join(caseroot, ".gitignore"),
)
append_case_status(
"", "", "local git repository created", gitinterface=self._gitinterface
)
if not os.path.exists(os.path.join(caseroot, ".gitignore")):
safe_copy(
os.path.join(
self.get_value("CIMEROOT"),
"CIME",
"data",
"templates",
"gitignore.template",
),
os.path.join(caseroot, ".gitignore"),
)
append_case_status(
"", "", "local git repository created", gitinterface=self._gitinterface
)
# add all files in caseroot to local repository
self._gitinterface._git_command("add", "*")
else:
logger.warning("git interface requires git version 2.28 or newer")

elif not self._gitinterface:
append_case_status(
"",
"",
f"local git version too old for cime git interface {major}.{minor}",
"Local git version too old for cime git interface, version 2.28 or newer required.",
)
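With the sanity check relocated into _case_setup_impl, a GPU case is validated when case.setup() runs rather than at case creation time. A hedged usage sketch via the Python Case API follows; the caseroot path, GPU type, and GPU count are hypothetical, and valid values remain machine- and compiler-specific:

# Hypothetical usage; the path and values are illustrative, not from the commit.
from CIME.case import Case

with Case("/path/to/caseroot", read_only=False) as case:
    case.set_value("GPU_TYPE", "a100")            # must be a GPU type the machine defines
    case.set_value("OPENACC_GPU_OFFLOAD", True)   # at least one *_GPU_OFFLOAD flag must be on
    case.set_value("NGPUS_PER_NODE", 4)           # clamped to MAX_GPUS_PER_NODE during setup
    case.case_setup()  # raises RuntimeError if GPU_TYPE and the offload flags disagree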
