Merge pull request #14 from jessech-en/main
Add support for more hyperpod instance types + Add transformers upgrade to enroot slurm code path
jessech-en authored Dec 24, 2024
2 parents 5c66df4 + be7a466 commit 5f8b472
Showing 3 changed files with 76 additions and 4 deletions.
42 changes: 42 additions & 0 deletions launcher/accelerator_devices.py
@@ -15,9 +15,12 @@
"p4d.24xlarge": 8,
"p4de.24xlarge": 8,
"p5.48xlarge": 8,
"p5e.48xlarge": 8,
"p5en.48xlarge": 8,
"trn1.2xlarge": 1,
"trn1.32xlarge": 16,
"trn1n.32xlarge": 16,
"trn2.48xlarge": 16,
"g5.xlarge": 1,
"g5.2xlarge": 1,
"g5.4xlarge": 1,
@@ -26,15 +29,36 @@
"g5.16xlarge": 1,
"g5.24xlarge": 4,
"g5.48xlarge": 8,
"g6.xlarge": 1,
"g6.2xlarge": 1,
"g6.4xlarge": 1,
"g6.8xlarge": 1,
"g6.16xlarge": 1,
"g6.12xlarge": 4,
"g6.24xlarge": 4,
"g6.48xlarge": 8,
"gr6.4xlarge": 1,
"gr6.8xlarge": 1,
"g6e.xlarge": 1,
"g6e.2xlarge": 1,
"g6e.4xlarge": 1,
"g6e.8xlarge": 1,
"g6e.16xlarge": 1,
"g6e.12xlarge": 4,
"g6e.24xlarge": 4,
"g6e.48xlarge": 8,
}

coresPerAcceleratorDevice = {
"p4d.24xlarge": 1,
"p4de.24xlarge": 1,
"p5.48xlarge": 1,
"p5e.48xlarge": 1,
"p5en.48xlarge": 1,
"trn1.2xlarge": 2,
"trn1.32xlarge": 2,
"trn1n.32xlarge": 2,
"trn2.48xlarge": 2,
"g5.xlarge": 1,
"g5.2xlarge": 1,
"g5.4xlarge": 1,
@@ -43,6 +67,24 @@
"g5.16xlarge": 1,
"g5.24xlarge": 1,
"g5.48xlarge": 1,
"g6.xlarge": 1,
"g6.2xlarge": 1,
"g6.4xlarge": 1,
"g6.8xlarge": 1,
"g6.16xlarge": 1,
"g6.12xlarge": 1,
"g6.24xlarge": 1,
"g6.48xlarge": 1,
"gr6.4xlarge": 1,
"gr6.8xlarge": 1,
"g6e.xlarge": 1,
"g6e.2xlarge": 1,
"g6e.4xlarge": 1,
"g6e.8xlarge": 1,
"g6e.16xlarge": 1,
"g6e.12xlarge": 1,
"g6e.24xlarge": 1,
"g6e.48xlarge": 1,
}


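For context, a minimal self-contained sketch of how the two tables above might be combined. The first dictionary's name falls outside the diff hunk, so devicesPerInstance is an assumed name here; only coresPerAcceleratorDevice is visible in the diff, and the helper itself is illustrative, not part of this PR. Values are copied from the additions above.

# Illustrative sketch, not code from this PR. Toy excerpts of the two tables:
devicesPerInstance = {"trn2.48xlarge": 16, "p5e.48xlarge": 8}        # assumed dict name; header is outside the hunk
coresPerAcceleratorDevice = {"trn2.48xlarge": 2, "p5e.48xlarge": 1}  # name visible in the diff

def accelerator_cores_for(instance_type: str) -> int:
    """Total accelerator cores per node: device count x cores per device."""
    if instance_type not in devicesPerInstance:
        raise ValueError(f"unsupported instance type: {instance_type}")
    return devicesPerInstance[instance_type] * coresPerAcceleratorDevice[instance_type]

# Per the additions above: trn2.48xlarge has 16 Trainium2 devices x 2 cores = 32,
# while p5e.48xlarge has 8 GPU devices x 1 core each = 8.
assert accelerator_cores_for("trn2.48xlarge") == 32
assert accelerator_cores_for("p5e.48xlarge") == 8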
26 changes: 25 additions & 1 deletion launcher/efa.py
@@ -51,6 +51,11 @@
"g6.24xlarge",
"g6.48xlarge",
"g6.8xlarge",
"g6e.8xlarge",
"g6e.16xlarge",
"g6e.12xlarge",
"g6e.24xlarge",
"g6e.48xlarge",
"gr6.8xlarge",
"i3en.12xlarge",
"i3en.24xlarge",
@@ -88,6 +93,8 @@
"p4d.24xlarge",
"p4de.24xlarge",
"p5.48xlarge",
"p5e.48xlarge",
"p5en.48xlarge",
"r5dn.24xlarge",
"r5dn.metal",
"r5n.24xlarge",
@@ -118,6 +125,7 @@
"r8g.metal-48xl",
"trn1.32xlarge",
"trn1n.32xlarge",
"trn2.48xlarge",
"u7i-12tb.224xlarge",
"u7in-16tb.224xlarge",
"u7in-24tb.224xlarge",
@@ -138,10 +146,26 @@
"p5.4xlarge": 4,
"p5.24xlarge": 16,
"p5.48xlarge": 32,
"p5e.48xlarge": 32,
"p5en.48xlarge": 16,
"trn1.32xlarge": 8,
"trn1n.32xlarge": 16,
"trn2.48xlarge": 16,
"g6e.24xlarge": 2,
"g6e.48xlarge": 4,
}

instanceWithRDMASupport = set(
["p4d.24xlarge", "p4de.24xlarge", "p5.4xlarge", "p5.24xlarge", "p5.48xlarge", "trn1.32xlarge", "trn1n.32xlarge"]
[
"p4d.24xlarge",
"p4de.24xlarge",
"p5.4xlarge",
"p5.24xlarge",
"p5.48xlarge",
"p5e.48xlarge",
"p5en.48xlarge",
"trn1.32xlarge",
"trn1n.32xlarge",
"trn2.48xlarge",
]
)
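A rough sketch of how these efa.py tables could be queried. The names instanceWithMultipleEFAs and instanceWithRDMASupport are visible in the diff, but describe_efa is a hypothetical helper, and the fallback of one EFA device for instance types absent from instanceWithMultipleEFAs is an assumption. Runnable only with the repository on PYTHONPATH.

# Illustrative sketch, not code from this PR.
from launcher.efa import instanceWithMultipleEFAs, instanceWithRDMASupport

def describe_efa(instance_type: str) -> str:
    # Assumption: instances missing from instanceWithMultipleEFAs expose one EFA device.
    efa_count = instanceWithMultipleEFAs.get(instance_type, 1)
    rdma = "supports" if instance_type in instanceWithRDMASupport else "does not support"
    return f"{instance_type}: {efa_count} EFA device(s), {rdma} RDMA"

# Per the additions above, p5en.48xlarge gains 16 EFA devices plus RDMA support,
# while g6e.48xlarge gains 4 EFA devices without RDMA.
print(describe_efa("p5en.48xlarge"))  # p5en.48xlarge: 16 EFA device(s), supports RDMA
print(describe_efa("g6e.48xlarge"))   # g6e.48xlarge: 4 EFA device(s), does not support RDMA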
12 changes: 9 additions & 3 deletions launcher/nemo/stages.py
@@ -293,7 +293,7 @@ def _make_launch_docker_container_text(self):
if self.cfg.cluster.slurm_docker_cfg.get("post_launch_commands", None) is not None:
for cmd in self.cfg.cluster.slurm_docker_cfg.post_launch_commands:
post_launch_commands.append(cmd)
if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False):
if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
transformers_upgrade_cmd = "pip install transformers==4.45.2"
post_launch_commands.append(transformers_upgrade_cmd)

@@ -404,6 +404,12 @@ def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str:
script_text.append("")
script_text.append("unset SLURM_NTASKS")

if get_container_type(self.cfg.get("container", None)) == "enroot" and self.cluster == "bcm":
if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
transformers_upgrade_cmd = "pip install transformers==4.45.2"
script_text.append("")
script_text.append(transformers_upgrade_cmd)

script_text.append("")
script_text.append(self._make_custom_call_string(stage_cfg_path))
return "\n".join(script_text)
@@ -460,7 +466,7 @@ def create_sm_jobs_script(self, job_folder):
shutil.copy(script_src, script_dst)
# FIXME: Remove transformers requirement when container is updated to include the version
# required to run multi-modal.
if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False):
if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
reqs_filename = Path(job_folder) / "requirements.txt"
with open(reqs_filename, "w") as reqs_file:
reqs_file.write(f"transformers=={TRANSFORMERS_VERSION_FOR_MULTIMODAL}")
@@ -726,7 +732,7 @@ def update_stage_specific_k8s_values(self, values_template):
else:
values_template.trainingConfig.scriptPath = str(self._entry_script_path)

if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False):
if OmegaConf.select(self.cfg, "recipes.model.multi_modal", default=False):
transformers_upgrade_cmd = "pip install transformers==4.45.2"
values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd)

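The stages.py hunks make two changes: every multi-modal check now uses OmegaConf.select, which walks the dotted key path and returns the default when any intermediate node (such as recipes.model) is absent, and the enroot/Slurm ("bcm") code path now appends the same pinned transformers upgrade to the generated train script. A minimal sketch of the config-access pattern this PR standardizes on; the toy config is illustrative:

from omegaconf import OmegaConf

cfg = OmegaConf.create({"recipes": {}})  # no "model" node at all

# Old pattern: an explicit existence check must guard the nested .get()
old = bool(cfg.recipes.get("model", None) and cfg.recipes.model.get("multi_modal", False))

# New pattern from this PR: one call, safe against missing intermediate nodes
new = OmegaConf.select(cfg, "recipes.model.multi_modal", default=False)

assert old is False and new is False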
