Merge pull request #15 from aws/release-1.1.0
SageMaker HyperPod Recipes Release 1.1.0
rohithn1 authored Dec 31, 2024
2 parents 5f8b472 + 459133e commit 193aaa8
Showing 19 changed files with 1,506 additions and 0 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -33,8 +33,10 @@ List of specific pre-training recipes used by the launch scripts.
| Hugging Face | Llama 3.2 | 3b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 16384 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 16384 | 128 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x128_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 8192 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh) |
| Hugging Face | Llama 3.1 | 70b | 8192 | 128 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x128_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh) |
| Hugging Face | Llama 3 | 70b | 8192 | 16 | ml.trn1.32xlarge | TRN | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh) |
| Hugging Face | Llama 3.1 | 8b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh) |
| Hugging Face | Llama 3.1 | 8b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh) |
@@ -48,8 +50,10 @@ List of specific pre-training recipes used by the launch scripts.
| Hugging Face | Mistral | 7b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 16384 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 16384 | 128 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 8192 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh) |
| Hugging Face | Mixtral | 22b | 8192 | 128 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh) |
| Hugging Face | Mixtral | 7b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh) |
| Hugging Face | Mixtral | 7b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh) |
| Hugging Face | Mixtral | 7b | 8192 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh) |
@@ -64,10 +68,15 @@ All model sources are from Hugging Face.
| Model | Method | Size | Sequence length | Nodes | Instance | Accelerator | Recipe | Script |
| --------- | ------ | ---- | ----------------| ----- | -------------- | ----------- | ------ | ------ |
| Llama 3.1 | QLoRA | 405b | 131072 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh) |
| Llama 3.1 | QLoRA | 405b | 32768 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh) |
| Llama 3.1 | LoRA | 405b | 16384 | 6 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh) |
| Llama 3.1 | QLoRA | 405b | 16384 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh) |
| Llama 3.1 | LoRA | 405b | 8192 | 6 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh) |
| Llama 3.1 | QLoRA | 405b | 8192 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh) |
| Llama 3.3 | SFT | 70b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh) |
| Llama 3.3 | LoRA | 70b | 16384 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_lora.sh) |
| Llama 3.3 | SFT | 70b | 8192 | 10 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh) |
| Llama 3.3 | LoRA | 70b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_lora.sh) |
| Llama 3.1 | SFT | 70b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh) |
| Llama 3.1 | LoRA | 70b | 16384 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh) |
| Llama 3.1 | SFT | 70b | 8192 | 10 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh) |
28 changes: 28 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_fine_tuning \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b-fine-tuning" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.trainer.num_nodes=16 \
recipes.model.train_batch_size=1 \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
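
The script above (run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh, per the README table) takes all of its inputs from environment variables rather than command-line arguments. A minimal invocation sketch, assuming the repository root as the working directory and a cluster type already set in recipes_collection/config.yaml; every path and the model ID below are placeholders, not values shipped with the release:

# Placeholder inputs -- substitute your own dataset, output, and model locations.
export TRAIN_DIR="/fsx/datasets/llama/train"                      # hypothetical training dataset path
export VAL_DIR="/fsx/datasets/llama/val"                          # hypothetical validation dataset path
export EXP_DIR="/fsx/experiments/llama3-3-70b-sft"                # hypothetical experiment/checkpoint directory
export HF_MODEL_NAME_OR_PATH="meta-llama/Llama-3.3-70B-Instruct"  # assumed Hugging Face model ID
export HF_ACCESS_TOKEN="hf_..."                                   # optional; needed only for gated models

bash launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh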
28 changes: 28 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_lora.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_lora \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b-lora" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.trainer.num_nodes=2 \
recipes.model.train_batch_size=1 \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
28 changes: 28 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_fine_tuning \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b-fine-tuning" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.trainer.num_nodes=10 \
recipes.model.train_batch_size=1 \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
28 changes: 28 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_lora.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_lora \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b-lora" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.trainer.num_nodes=1 \
recipes.model.train_batch_size=1 \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
28 changes: 28 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-405b-qlora" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.trainer.num_nodes=2 \
recipes.model.train_batch_size=1 \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \
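
Because every launcher passes its recipe settings as Hydra key=value overrides, individual values can also be changed at invocation time rather than by editing the script or recipe YAML. A hedged sketch that reuses only keys already present in the scripts above (here, giving the 405b seq32k QLoRA run a different name); whether other recipe keys accept overrides in the same way depends on the recipe YAML itself:

# Sketch: same QLoRA recipe as above, but with a different run name.
# Assumes SAGEMAKER_TRAINING_LAUNCHER_DIR, EXP_DIR, TRAIN_DIR, VAL_DIR,
# HF_MODEL_NAME_OR_PATH, and HF_ACCESS_TOKEN are already exported.
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-405b-qlora-run2" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=2 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"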
21 changes: 21 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x128_pretrain.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x128_pretrain \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
21 changes: 21 additions & 0 deletions launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x128_pretrain \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-llama3-70b" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
21 changes: 21 additions & 0 deletions launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-mixtral-8x22b" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
21 changes: 21 additions & 0 deletions launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}" # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.


HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain \
base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
recipes.run.name="hf-mixtral-8x22b" \
recipes.exp_manager.exp_dir="$EXP_DIR" \
recipes.model.data.train_dir="$TRAIN_DIR" \
recipes.model.data.val_dir="$VAL_DIR" \
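
The 128-node pre-training launchers shown here follow the same environment-variable pattern but, unlike the fine-tuning scripts, take no Hugging Face model or token inputs: only the dataset and experiment directories are required. A minimal sketch with placeholder paths:

# Placeholder paths -- substitute your own dataset and output locations.
export TRAIN_DIR="/fsx/datasets/llama/train"        # hypothetical training dataset path
export VAL_DIR="/fsx/datasets/llama/val"            # hypothetical validation dataset path
export EXP_DIR="/fsx/experiments/llama3-70b-p5x128" # hypothetical experiment/checkpoint directory

bash launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh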