Commit b591c23: Add slurm system setup

satyaog committed Sep 20, 2024
1 parent 3e45407 commit b591c23
Showing 14 changed files with 669 additions and 125 deletions.
7 changes: 6 additions & 1 deletion config/base.yaml
@@ -28,6 +28,7 @@ _torchvision:
    --loader: pytorch
    --data: "{milabench_data}/FakeImageNet"


_torchvision_ddp:
  inherits: _defaults
  definition: ../benchmarks/torchvision_ddp
@@ -113,6 +114,7 @@ _timm:
    --dataset: "FakeImageNet"
    --workers: "auto({n_worker}, 8)"


_accelerate_opt:
  inherits: _defaults
  tags:
@@ -149,6 +151,7 @@ _accelerate_opt:
  use_deepspeed: true
  num_machines: 1


fp16:
  inherits: _flops

@@ -388,6 +391,7 @@ brax:
    --num-minibatches: 32
    --num-envs: 8192


_diffusion:
  inherits: _defaults
  definition: ../benchmarks/diffusion
@@ -530,11 +534,11 @@ _llm:
  definition: ../benchmarks/llm
  install_group: torch


llm-lora-single:
  inherits: _llm
  plan:
    method: per_gpu

  argv:
    "{milabench_code}/recipes/lora_finetune_single_device.py": true
    --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
@@ -596,6 +600,7 @@ llm-lora-ddp-nodes:
  requires_capabilities:
    - "len(nodes) >= ${num_machines}"


llm-lora-mp-gpus:
  inherits: _llm
  plan:
13 changes: 13 additions & 0 deletions config/cloud-multinodes-system.yaml
@@ -38,3 +38,16 @@ system:
      size: Standard_NV72ads_A10_v5
      location: eastus2
      disk_size: 512
    slurm__a100_x2:
      address: localhost
      bashrc_path: "{bashrc_path}"
      remote_workdir: "scratch/cov-{job_uuid}-workdir"
      use_srun: null
      options:
        ntasks-per-node: 1
        gpus-per-task: a100l:2
        cpus-per-task: 12
        time: "3:0:0"
        mem: 64000
        partition: short-unkillable
        nodelist: cn-g[001-029]
24 changes: 24 additions & 0 deletions config/cloud-system.yaml
@@ -38,3 +38,27 @@ system:
      size: Standard_NV72ads_A10_v5
      location: eastus2
      disk_size: 512
    slurm__a100_x1:
      address: localhost
      bashrc_path: "{bashrc_path}"
      remote_workdir: "scratch/cov-{job_uuid}-workdir"
      use_srun: null
      options:
        ntasks-per-node: 1
        gpus-per-task: a100l:1
        cpus-per-task: 6
        time: "3:0:0"
        mem: 32000
        partition: unkillable
    slurm__a100_x4:
      address: localhost
      bashrc_path: "{bashrc_path}"
      remote_workdir: "scratch/cov-{job_uuid}-workdir"
      use_srun: null
      options:
        ntasks-per-node: 1
        gpus-per-task: a100l:4
        cpus-per-task: 24
        time: "3:0:0"
        mem: 128000
        partition: short-unkillable
10 changes: 10 additions & 0 deletions config/examples/cloud-multinodes-system.yaml
@@ -35,3 +35,13 @@ system:
      volume_size: 8
      region: us-east-2
      state_id: 71669879043a3864225aabb94f91a2d4
    slurm:
      address: localhost
      bashrc_path: "{bashrc_path}"
      remote_workdir: "scratch/cov-{job_uuid}-workdir"
      use_srun: null
      options:
        ntasks-per-node: 1
        cpus-per-task: 1
        time: "0:30:0"
        mem: 1000
11 changes: 11 additions & 0 deletions config/examples/cloud-system.yaml
@@ -28,3 +28,14 @@ system:
      instance_type: t2.micro
      volume_size: 8
      region: us-east-2
    slurm:
      # covalent-slurm-plugin args
      address: localhost
      bashrc_path: "{bashrc_path}"
      remote_workdir: "scratch/cov-{job_uuid}-workdir"
      use_srun: null
      options:
        ntasks-per-node: 1
        cpus-per-task: 1
        time: "0:30:0"
        mem: 1000
89 changes: 85 additions & 4 deletions docs/usage.rst
@@ -102,7 +102,7 @@ Create a cloud system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
supported cloud profiles.
supported cloud and slurm profiles.

.. note::

@@ -150,14 +150,95 @@ Run milabench on the cloud
^^^^^^^^^^^^^^^^^^^^^^^^^^

1. | Initialize the cloud instances
   | ``milabench cloud --system {{SYSTEM_CONFIG.YAML}} --setup --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
   | ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
2. | Prepare, install and run milabench
   | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}``
3. | Destroy the cloud instances
   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
   | ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
   | or
   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
   | ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
   | to destroy not just a single cloud instance but all instances on a
     specified platform that were created from the current local machine
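
For example, with a hypothetical profile named ``azure__a10`` (the profile and
file names below are illustrative), the full lifecycle looks like:

.. code-block:: bash

   # create the instances and capture the generated system configuration
   milabench cloud --setup --system system.yaml --run-on azure__a10 >system_cloud.yaml
   # run the benchmarks on the cloud instances (install/prepare work the same way)
   milabench run --system system_cloud.yaml
   # destroy the instances
   milabench cloud --teardown --system system_cloud.yaml --run-on azure__a10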


Use milabench on slurm
~~~~~~~~~~~~~~~~~~~~~~


Create a slurm system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
supported cloud and slurm profiles.

.. note::

   Nodes that should be created on the cloud should have the ``1.1.1.1`` ip
   address placeholder. Other ip addresses will be used as-is and no cloud
   instance will be created for those nodes.

.. note::

   A cloud profile entry needs to start with a covalent plugin name (e.g.
   ``slurm``). To define multiple profiles on the same cloud platform, use the
   form ``{PLATFORM}__{PROFILE_NAME}`` (e.g. ``slurm__profile``). All cloud
   profile attributes will be used as-is as arguments for the target covalent
   plugin.

.. code-block:: yaml

   system:
     nodes:
       - name: manager
         # Use 1.1.1.1 as an ip placeholder
         ip: 1.1.1.1
         main: true
         user: <username>
       - name: node1
         ip: 1.1.1.1
         main: false
         user: <username>

     # Cloud instances profiles
     cloud_profiles:
       # The cloud platform to use in the form of {PLATFORM} or
       # {PLATFORM}__{PROFILE_NAME}
       slurm:
         username: username
         address: localhost
         ssh_key_file: ssh_key_file
         # bashrc_path will be replaced by the content of
         # milabench/scripts/covalent/covalent_bashrc.sh
         bashrc_path: "{bashrc_path}"
         # job_uuid will be replaced by the generated job's uuid
         remote_workdir: "cov-{job_uuid}-workdir"
         use_srun: null
         options:
           ntasks-per-node: 1
           cpus-per-task: 1
           time: "0:30:0"
           mem: 1000
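
To define more than one profile for the same platform, use the
``{PLATFORM}__{PROFILE_NAME}`` form described above. For instance,
``config/cloud-system.yaml`` in this commit defines the ``slurm__a100_x1`` and
``slurm__a100_x4`` profiles, which differ only in the resources they request:

.. code-block:: yaml

   slurm__a100_x1:
     address: localhost
     bashrc_path: "{bashrc_path}"
     remote_workdir: "scratch/cov-{job_uuid}-workdir"
     use_srun: null
     options:
       ntasks-per-node: 1
       gpus-per-task: a100l:1
       cpus-per-task: 6
       time: "3:0:0"
       mem: 32000
       partition: unkillable
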
Run milabench on slurm
^^^^^^^^^^^^^^^^^^^^^^

1. | Initialize the slurm instances
   | ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_SLURM_CONFIG.YAML}}``
2. | Prepare, install and run milabench
   | ``milabench [prepare|install|run] --system {{SYSTEM_SLURM_CONFIG.YAML}}``
3. | Destroy the slurm instances
   | ``milabench cloud --teardown --system {{SYSTEM_SLURM_CONFIG.YAML}} --run-on {{PROFILE}}``
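
For example, using the ``slurm__a100_x1`` profile defined in
``config/cloud-system.yaml`` (the output file name is illustrative):

.. code-block:: bash

   milabench cloud --setup --system config/cloud-system.yaml --run-on slurm__a100_x1 >system_slurm.yaml
   milabench install --system system_slurm.yaml
   milabench prepare --system system_slurm.yaml
   milabench run --system system_slurm.yaml
   milabench cloud --teardown --system system_slurm.yaml --run-on slurm__a100_x1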

.. note::

   Because milabench's path is expected to be the same on the local and the
   remote machine, it is currently necessary to run the commands from the
   slurm cluster. As the ``milabench cloud --[setup|teardown]`` commands
   require a covalent server to run, and to avoid overloading the login nodes'
   resources, it is preferable to request a cpu compute node to host the
   covalent server. An allocation with minimal resources like ``--nodes 1
   --cpus-per-task 1 --mem 1000`` should be enough.
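
One possible way to do this, sketched under the assumption of a standard slurm
installation (adapt the flags to your cluster):

.. code-block:: bash

   # Request a minimal cpu-only allocation to host the covalent server
   salloc --nodes 1 --cpus-per-task 1 --mem 1000
   # Then, from the allocated node, run the setup as usual
   milabench cloud --setup --system config/cloud-system.yaml --run-on slurm__a100_x1 >system_slurm.yaml
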
51 changes: 29 additions & 22 deletions milabench/cli/cloud.py
@@ -60,29 +60,33 @@ def manage_cloud(pack, run_on, action="setup"):
        "private_ip":(lambda v: ("internal_ip",v)),
        "username":(lambda v: ("user",v)),
        "ssh_key_file":(lambda v: ("key",v)),
        # "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
        "env":(lambda v: ("env",[".", v, "milabench", "&&"])),
        "slurm_job_id":(lambda v: ("slurm_job_id",v)),
    }
    plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
    plan_params = pack.config["system"]["cloud_profiles"][run_on]
    run_on, *profile = run_on.split("__")
    profile = profile[0] if profile else ""
    default_state_prefix = profile or run_on
    default_state_id = "_".join((pack.config["hash"][:6], blabla()))
    local_base = pack.dirs.base.absolute()
    local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
    if local_data_dir is None:
        local_data_dir = local_base.parent
    remote_data_dir = XPath("/data") / local_data_dir.name
    plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
    plan_params["state_id"] = plan_params.get("state_id", default_state_id)
    plan_params["keep_alive"] = None

    # local_base = pack.dirs.base.absolute()
    # local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
    # if local_data_dir is None:
    #     local_data_dir = local_base.parent
    # remote_data_dir = XPath("/data") / local_data_dir.name

    plan_params_copy = deepcopy(plan_params)

    nodes = iter(enumerate(pack.config["system"]["nodes"]))
    for i, n in nodes:
        if n["ip"] != "1.1.1.1":
        if n["ip"] != "1.1.1.1" and action == _SETUP:
            continue

        plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
        plan_params["state_id"] = plan_params.get("state_id", default_state_id)
        plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
        plan_params["keep_alive"] = None
        plan_params_copy["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)

        import milabench.scripts.covalent as cv

@@ -101,17 +105,17 @@ def manage_cloud(pack, run_on, action="setup"):
            "-m", cv.__name__,
            run_on,
            f"--{action}",
            *_flatten_cli_args(**plan_params)
            *_flatten_cli_args(**plan_params_copy)
        ]
        if action == _SETUP:
            cmd += [
                "--",
                "bash", "-c",
                _or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
                " && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
                f" && mkdir -p '{remote_data_dir}'"
                f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
            ]
        # if action == _SETUP:
        #     cmd += [
        #         "--",
        #         "bash", "-c",
        #         _or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
        #         " && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
        #         f" && mkdir -p '{remote_data_dir}'"
        #         f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
        #     ]
        p = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
@@ -155,6 +159,9 @@ def manage_cloud(pack, run_on, action="setup"):
                stderr
            )

        if action == _TEARDOWN:
            break

    return pack.config["system"]

11 changes: 10 additions & 1 deletion milabench/commands/__init__.py
@@ -450,6 +450,11 @@ def _find_node_config(self) -> Dict:
                return n
        return {}

    def _load_env(self, node):
        if node.get("env", None):
            return node["env"]
        return []

    def is_local(self):
        localnode = self.pack.config["system"]["self"]

@@ -484,7 +489,7 @@ def _argv(self, **kwargs) -> List:
            argv.append(f"-p{self.port}")
        argv.append(host)

        return argv # + ["env", "-i"]
        return argv + self._load_env(node)


class SCPCommand(SSHCommand, CmdCommand):
@@ -505,6 +510,10 @@ def __init__(
        self.src = src
        self.dest = dest if dest is not None else self.src

    def _load_env(self, node):
        del node
        return []

    def _argv(self, **kwargs) -> List:
        argv = super()._argv(**kwargs)
