Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/improve-release-tests' into 'main'
Browse files (browse the repository at this point in the history)
ci: Do not print logs for release tests

See merge request ADLR/megatron-lm!2654
  • Loading branch information
ko3n1g committed Feb 9, 2025
2 parents cd4a391 + 4d00edb commit 2481987
Show file tree
Hide file tree
Showing 8 changed files with 23 additions and 13 deletions.
4 changes: 2 additions & 2 deletions tests/functional_tests/shell_test_utils/_run_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,15 @@ MASTER_PORT=${MASTER_PORT:-6000}
NUM_NODES=${NUM_NODES:-${SLURM_NNODES}}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID}}
LAST_RANK=$((NUM_NODES * 8 - 1))
LAST_RANK=7
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
--node_rank $SLURM_NODEID
--log-dir $OUTPUT_PATH
--tee "0:3,$LAST_RANK:3"
--tee "0:3,7:3"
--redirects "3"
)

Expand Down
9 changes: 9 additions & 0 deletions tests/functional_tests/shell_test_utils/run_ci_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ for i in $(seq 1 $N_REPEAT); do
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
export CHECKPOINT_LOAD_PATH=/tmp/checkpoints/

if [[ "$TEST_TYPE" = "release" ]]; then
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
fi

bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh

if [[ "$TEST_TYPE" = "frozen-resume" && -z "$(ls -A "$_CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]]; then
Expand Down Expand Up @@ -115,6 +120,10 @@ for i in $(seq 1 $N_REPEAT); do
--output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH) \
"${EXTRACT_ARGS[@]}"

if [[ "$TEST_TYPE" == "release" ]]; then
SKIP_PYTEST=0
fi

# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
export NVTE_ALLOW_NONDETERMINISTIC_ALGO
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ MODEL_ARGS:
--log-interval: 100
--save-interval: 2000
--eval-interval: 1000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--eval-iters: 10
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-timers-to-tensorboard: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ MODEL_ARGS:
--eval-iters: 32
--eval-interval: 2000
# Add checkpointing args
--load: ${OUTPUT_PATH}/checkpoints
--save: ${OUTPUT_PATH}/checkpoints
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--save-interval: 5000
# Add initialization args
--init-method-std: 0.0134
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ MODEL_ARGS:
--eval-iters: 32
--eval-interval: 200
# Add checkpointing args
--load: ${OUTPUT_PATH}/checkpoints
--save: ${OUTPUT_PATH}/checkpoints
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--save-interval: 5000
# Add initialization args
--init-method-std: 0.010
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ MODEL_ARGS:
--eval-iters: 32
--eval-interval: 200
# Add checkpointing args
--load: ${OUTPUT_PATH}/checkpoints
--save: ${OUTPUT_PATH}/checkpoints
# Write checkpoints to the save path; resume from the load path.
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--save-interval: 500
# Add initialization args
--init-method-std: 0.010
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ MODEL_ARGS:
--log-interval: 100
--save-interval: 2000
--eval-interval: 1000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--eval-iters: 10
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-timers-to-tensorboard: true
Expand Down
3 changes: 2 additions & 1 deletion tests/test_utils/python_scripts/launch_jet_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,8 @@ def main(
n_attempts += 1
continue

print(f"Logs:\n{concat_logs}")
if test_type != "release":
print(f"Logs:\n{concat_logs}")

success = pipeline.get_status() == PipelineStatus.SUCCESS
logger.info("Pipeline terminated with status %s", pipeline.get_status().name)
Expand Down

0 comments on commit 2481987

Please sign in to comment.