Merge branch 'master' into update-mvk-config
jasonb5 committed Oct 10, 2024
2 parents 0892b4a + 62a9b17 commit a5cfc65
Showing 16 changed files with 181 additions and 218 deletions.
26 changes: 17 additions & 9 deletions .github/workflows/testing.yml
@@ -39,28 +39,28 @@ jobs:
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
uses: docker/metadata-action@v5
with:
images: ghcr.io/ESMCI/cime
tags: |
type=raw,value=latest,enable=${{ github.event_name == 'push' }}
type=ref,event=pr,enable=${{ github.event_name == 'pull_request' }}
type=sha,format=long
- name: Build and push
uses: docker/build-push-action@v3
uses: docker/build-push-action@v6
with:
target: base
context: docker/
@@ -76,7 +76,7 @@ jobs:
timeout-minutes: 2
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Set up python
uses: actions/setup-python@v2
with:
@@ -102,7 +102,7 @@ jobs:
python-version: ['3.8', '3.9', '3.10']
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Run tests
shell: bash
env:
@@ -151,7 +151,7 @@ jobs:
driver: "mct"
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Cache inputdata
uses: actions/cache@v2
with:
@@ -178,6 +178,14 @@ jobs:
conda activate base
# container libnetcdf is 4.9.2 as cesm requires esmf >8.6.1
# e3sm scorpio incompatible with 4.9.2, downgrade to 4.9.1
# only reference found about scorpio incompatibility with 4.9.2 (https://github.com/E3SM-Project/scorpio/issues/554#issuecomment-1877361470)
# TODO open scorpio issue, possible solutions; 1. support two conda environments in container 2. maybe move from conda to spack? build all libraries in image
if [[ "${CIME_MODEL}" == "e3sm" ]]; then
mamba install -y 'libnetcdf=4.9.1'
fi
pytest -vvv --cov=CIME --machine docker --no-fortran-run --no-teardown CIME/tests/test_sys*
- uses: mxschmitt/action-tmate@v3
if: ${{ !always() }}
46 changes: 29 additions & 17 deletions CIME/XML/env_mach_pes.py
@@ -41,9 +41,6 @@ def get_value(
attribute=None,
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
@@ -58,12 +55,9 @@ def get_value(
value = EnvBase.get_value(self, vid, attribute, resolved, subgroup)

if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if (ngpus_per_node and value) and value < 0:
value = -1 * value * max_cputasks_per_gpu_node
elif value and value < 0:
@@ -176,18 +170,29 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
if self.get_value("OVERSUBSCRIBE_GPU"):
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("NGPUS_PER_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
if self.get_value("OVERSUBSCRIBE_GPU"):
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("NGPUS_PER_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
@@ -204,7 +209,14 @@ def get_total_nodes(self, total_tasks, max_thread_count):
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
max_thread_count = 1
tasks_per_node = self.get_tasks_per_node(total_tasks, max_thread_count)
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
if self.get_value("OVERSUBSCRIBE_GPU"):
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
else:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
num_nodes = int(math.ceil(float(total_tasks) / ngpus_per_node))
else:
num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node))
return num_nodes, self.get_spare_nodes(num_nodes)

def get_spare_nodes(self, num_nodes):
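The new OVERSUBSCRIBE_GPU branch changes how many MPI tasks land on each GPU node. Below is a minimal sketch of that arithmetic (not part of this commit), assuming hypothetical machine settings of MAX_TASKS_PER_NODE=128, MAX_CPUTASKS_PER_GPU_NODE=64, and NGPUS_PER_NODE=4, with ESMF-aware threading off and one thread per task:

import math

# Sketch of the logic introduced in get_tasks_per_node()/get_total_nodes();
# not part of the commit. All machine values below are hypothetical.
def nodes_needed(total_tasks, max_tasks_per_node, max_cputasks_per_gpu_node,
                 ngpus_per_node, oversubscribe_gpu, max_thread_count=1):
    if oversubscribe_gpu:
        # Pack up to MAX_CPUTASKS_PER_GPU_NODE MPI tasks on each GPU node.
        tasks_per_node = min(max_tasks_per_node // max_thread_count,
                             max_cputasks_per_gpu_node, total_tasks)
    else:
        # Without oversubscription, tasks per node are capped at NGPUS_PER_NODE
        # (get_total_nodes likewise divides by NGPUS_PER_NODE in this case).
        tasks_per_node = min(max_tasks_per_node // max_thread_count,
                             ngpus_per_node, total_tasks)
    return int(math.ceil(float(total_tasks) / tasks_per_node))

print(nodes_needed(256, 128, 64, 4, oversubscribe_gpu=True))   # 4 nodes
print(nodes_needed(256, 128, 64, 4, oversubscribe_gpu=False))  # 64 nodes

With these assumed values, a 256-task case fits on 4 GPU nodes when oversubscription is allowed, but needs 64 nodes when each MPI task must own a GPU.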
16 changes: 4 additions & 12 deletions CIME/build.py
@@ -247,18 +247,10 @@ def get_standard_cmake_args(case, sharedpath):
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD")
openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD")
kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD")
cmake_args += f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload} -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload} -DKOKKOS_GPU_OFFLOAD={kokkos_gpu_offload} "

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
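For reference, the rewritten block composes one CMake define per offload model instead of the old single -DGPU_OFFLOAD flag. A sketch, not from the commit, with hypothetical values standing in for the case.get_value() lookups:

# Hypothetical values in place of case.get_value() lookups; "a100" and the
# TRUE/FALSE strings are illustrative only.
gpu_type = "a100"
openacc_gpu_offload = "TRUE"
openmp_gpu_offload = "FALSE"
kokkos_gpu_offload = "TRUE"

cmake_args = (
    f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload}"
    f" -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload}"
    f" -DKOKKOS_GPU_OFFLOAD={kokkos_gpu_offload} "
)
print(cmake_args)
# -DGPU_TYPE=a100 -DOPENACC_GPU_OFFLOAD=TRUE -DOPENMP_GPU_OFFLOAD=FALSE -DKOKKOS_GPU_OFFLOAD=TRUE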
67 changes: 0 additions & 67 deletions CIME/case/case.py
@@ -1301,9 +1301,6 @@ def configure(
non_local=False,
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
expect(
check_name(compset_name, additional_chars="."),
@@ -1561,64 +1558,6 @@
if test:
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if gpu_type and str(gpu_type).lower() != "none":
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != "none":
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
elif ngpus_per_node != 0:
expect(
False,
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
)

# Set these two GPU XML variables here to overwrite the default values
# Only set them for "cesm" model
if self._cime_model == "cesm":
self.set_value("GPU_TYPE", str(gpu_type).lower())
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

# --------------------------------------------
@@ -2440,9 +2379,6 @@ def create(
non_local=False,
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
@@ -2515,9 +2451,6 @@
non_local=non_local,
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
85 changes: 58 additions & 27 deletions CIME/case/case_setup.py
@@ -389,6 +389,48 @@ def _case_setup_impl(
+ case.iotasks,
)

# ----------------------------------------------------------------------------------------------------------
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs
# 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than
# the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically.
# 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = case.get_value("MAX_GPUS_PER_NODE")
gpu_type = case.get_value("GPU_TYPE")
openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD")
openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD")
kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD")
gpu_offload = (
openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload
)
ngpus_per_node = case.get_value("NGPUS_PER_NODE")
if gpu_type and str(gpu_type).lower() != "none":
if max_gpus_per_node <= 0:
raise RuntimeError(
f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run"
)
if not gpu_offload:
raise RuntimeError(
"GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled"
)
case.gpu_enabled = True
if ngpus_per_node >= 0:
case.set_value(
"NGPUS_PER_NODE",
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload:
raise RuntimeError(
"GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled"
)
elif ngpus_per_node and ngpus_per_node != 0:
raise RuntimeError(
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;"
)

# May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue)
env_batch = case.get_env("batch")
env_batch.set_job_defaults([(case.get_primary_job(), {})], case)
@@ -527,37 +569,26 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):


def _create_case_repo(self, caseroot):
version = run_cmd_no_fail("git --version")
result = re.findall(r"([0-9]+)\.([0-9]+)\.?[0-9]*", version)
major = int(result[0][0])
minor = int(result[0][1])

# gitinterface needs git version 2.28 or newer
if major > 2 or (major == 2 and minor >= 28):
self._gitinterface = GitInterface(
caseroot, logger, branch=self.get_value("CASE")
self._gitinterface = GitInterface(caseroot, logger, branch=self.get_value("CASE"))
if self._gitinterface and not os.path.exists(os.path.join(caseroot, ".gitignore")):
safe_copy(
os.path.join(
self.get_value("CIMEROOT"),
"CIME",
"data",
"templates",
"gitignore.template",
),
os.path.join(caseroot, ".gitignore"),
)
append_case_status(
"", "", "local git repository created", gitinterface=self._gitinterface
)
if not os.path.exists(os.path.join(caseroot, ".gitignore")):
safe_copy(
os.path.join(
self.get_value("CIMEROOT"),
"CIME",
"data",
"templates",
"gitignore.template",
),
os.path.join(caseroot, ".gitignore"),
)
append_case_status(
"", "", "local git repository created", gitinterface=self._gitinterface
)
# add all files in caseroot to local repository
self._gitinterface._git_command("add", "*")
else:
logger.warning("git interface requires git version 2.28 or newer")

elif not self._gitinterface:
append_case_status(
"",
"",
f"local git version too old for cime git interface {major}.{minor}",
"Local git version too old for cime git interface, version 2.28 or newer required.",
)
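With the sanity check relocated into _case_setup_impl, a GPU case is validated when case.setup() runs rather than at case creation time. A hedged usage sketch via the Python Case API follows; the caseroot path, GPU type, and GPU count are hypothetical, and valid values remain machine- and compiler-specific:

# Hypothetical usage; the path and values are illustrative, not from the commit.
from CIME.case import Case

with Case("/path/to/caseroot", read_only=False) as case:
    case.set_value("GPU_TYPE", "a100")            # must be a GPU type the machine defines
    case.set_value("OPENACC_GPU_OFFLOAD", True)   # at least one *_GPU_OFFLOAD flag must be on
    case.set_value("NGPUS_PER_NODE", 4)           # clamped to MAX_GPUS_PER_NODE during setup
    case.case_setup()  # raises RuntimeError if GPU_TYPE and the offload flags disagree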
