-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from aws/release-1.1.0
Sagemaker Hyperpod Recipes Release 1.1.0
- Loading branch information
Showing
19 changed files
with
1,506 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
28 changes: 28 additions & 0 deletions
28
launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_fine_tuning.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}"             # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the seq16k fine-tuning recipe.
# (fixed: "ect" typo; removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_fine_tuning \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b-fine-tuning" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=16 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"
28 changes: 28 additions & 0 deletions
28
launcher_scripts/llama/run_hf_llama3_3_70b_seq16k_gpu_lora.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}"             # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the seq16k LoRA recipe.
# (fixed: removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_3_70b_seq16k_gpu_lora \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b-lora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=2 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"
28 changes: 28 additions & 0 deletions
28
launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_fine_tuning.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}"             # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the seq8k fine-tuning recipe.
# (fixed: "ect" typo; removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_fine_tuning \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b-fine-tuning" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=10 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"
28 changes: 28 additions & 0 deletions
28
launcher_scripts/llama/run_hf_llama3_3_70b_seq8k_gpu_lora.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}"             # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the seq8k LoRA recipe.
# (fixed: removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_3_70b_seq8k_gpu_lora \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b-lora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=1 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"
28 changes: 28 additions & 0 deletions
28
launcher_scripts/llama/run_hf_llama3_405b_seq32k_gpu_qlora.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path
HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}"             # Optional HuggingFace access token

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the 405B seq32k QLoRA recipe.
# (fixed: removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_405b_seq32k_gpu_qlora \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-405b-qlora" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.trainer.num_nodes=2 \
    recipes.model.train_batch_size=1 \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.model.hf_access_token="$HF_ACCESS_TOKEN"
21 changes: 21 additions & 0 deletions
21
launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x128.pretrain.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the 70B seq16k pretrain recipe.
# (fixed: quoted all variable expansions so paths containing spaces don't
# word-split; removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x128_pretrain \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR"
21 changes: 21 additions & 0 deletions
21
launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x128_pretrain.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the 70B seq8k pretrain recipe.
# (fixed: quoted all variable expansions so paths containing spaces don't
# word-split; removed dangling trailing "\" after the final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x128_pretrain \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-llama3-70b" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR"
21 changes: 21 additions & 0 deletions
21
launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the Mixtral 8x22B seq16k
# pretrain recipe. (fixed: quoted all variable expansions so paths containing
# spaces don't word-split; removed dangling trailing "\" after final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x128_pretrain \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-mixtral-8x22b" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR"
21 changes: 21 additions & 0 deletions
21
launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
#!/bin/bash

# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com

# Users should set up their cluster type in /recipes_collection/config.yaml

SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}

TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset
VAL_DIR="${VAL_DIR}"     # Location of validation dataset

EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.

# Launch the Hydra-based training entry point with the Mixtral 8x22B seq8k
# pretrain recipe. (fixed: quoted all variable expansions so paths containing
# spaces don't word-split; removed dangling trailing "\" after final argument)
HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x128_pretrain \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.run.name="hf-mixtral-8x22b" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR"
Oops, something went wrong.