diff --git a/.flake8.other b/.flake8.other new file mode 100644 index 000000000000..5582f3331a3c --- /dev/null +++ b/.flake8.other @@ -0,0 +1,8 @@ +[flake8] +select = + F541, # f-string without any placeholders + F841, # local variable 'x' is assigned to but never used + F401, # 'x' imported but unused + E741, # ambiguous variable name 'l' + F821, # undefined name 'x' + E266, # too many leading '#' for block comment \ No newline at end of file diff --git a/.flake8.speech b/.flake8.speech new file mode 100644 index 000000000000..5582f3331a3c --- /dev/null +++ b/.flake8.speech @@ -0,0 +1,8 @@ +[flake8] +select = + F541, # f-string without any placeholders + F841, # local variable 'x' is assigned to but never used + F401, # 'x' imported but unused + E741, # ambiguous variable name 'l' + F821, # undefined name 'x' + E266, # too many leading '#' for block comment \ No newline at end of file diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 67743dc0cbc0..b8e975bbaa89 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1,4 @@ .github/ @pablo-garay @ko3n1g @thomasdhc @chtruong814 Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814 +.pylintrc.* @pablo-garay @ko3n1g @thomasdhc @chtruong814 +.flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814 \ No newline at end of file diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 25753d3792e1..1f14716d7f42 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -108,7 +108,8 @@ jobs: echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT" docker exec nemo_container_${{ github.run_id }} bash -c 'ls -al' - docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage + docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml + docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage exit $EXIT_CODE @@ -117,7 +118,9 @@ jobs: if: inputs.SAVE_COVERAGE_REPORT == true with: name: ${{ steps.main.outputs.coverage_report }} - path: ${{ github.run_id }}/.coverage + path: | + ${{ github.run_id }}/coverage.xml + ${{ github.run_id }}/.coverage include-hidden-files: true - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" diff --git a/.github/workflows/build-test-publish-wheel.yml b/.github/workflows/build-test-publish-wheel.yml index da940a94b638..a32668e80d7c 100644 --- a/.github/workflows/build-test-publish-wheel.yml +++ b/.github/workflows/build-test-publish-wheel.yml @@ -26,17 +26,11 @@ defaults: jobs: build-test-publish-wheel: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.7.0 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.20.0 with: - image-name: nemo_container - dockerfile: Dockerfile.ci - image-label: nemo-core - build-args: | - IMAGE_LABEL=nemo-core - prune-filter-timerange: 24h dry-run: true python-package: nemo - container-workdir: /workspace + python-version: '3.10' environment: public secrets: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7cecdd513ae4..3bbbb8990cb7 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -17,8 +17,11 @@ on: branches: - 'main' - 'r**' + - 'weekly-bump' types: [ labeled ] - + push: + branches: + - main workflow_dispatch: inputs: test_to_run: @@ -37,25 +40,48 @@ jobs: runs-on: ubuntu-latest outputs: test_to_run: ${{ steps.test_to_run.outputs.main }} - 
all: ${{ steps.all.outputs.main }} - event_name: ${{ steps.github-event.outputs.main }} + env: + TESTS_TO_RUN: ${{ inputs.test_to_run }} + EVENT_NAME: ${{ github.event_name }} + HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }} steps: - - name: Parse test_to_run + - name: Checkout branch + uses: actions/checkout@v4 + + - name: Select tests to run id: test_to_run run: | - parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")') + # For manual dispatch, we replace `all` with the actual job names + if [[ "$EVENT_NAME" == "workflow_dispatch" && "$TESTS_TO_RUN" == "all" ]]; then + TESTS_TO_RUN=$(cat .github/workflows/cicd-main.yml | yq '.jobs | [to_entries[] | .key] | join(",")') + + # For manual dispatch with provided list of tests, do nothing + elif [[ "$EVENT_NAME" == "workflow_dispatch" && "$TESTS_TO_RUN" != "all" ]]; then + TESTS_TO_RUN=$TESTS_TO_RUN + + # For correctly labeled PR, we replace `all` with the actual job names + elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then + TESTS_TO_RUN=$(cat .github/workflows/cicd-main.yml | yq '.jobs | [to_entries[] | .key] | join(",")') + + # For incorrectly labeled PR, run no tests + elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then + TESTS_TO_RUN="" + + # For push events, run only unit tests. This is so that we can generate coverage + # on branch `main`. + elif [[ "$EVENT_NAME" == "push" ]]; then + TESTS_TO_RUN=$(cat .github/workflows/cicd-main.yml | yq '.jobs | [to_entries[]] | [.[] | select(.key == "L0_Unit*") | .key] | join(",")') + + else + echo "Unsupported event_name $EVENT_NAME provided". + exit 1 + fi + + parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")') echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT" - - name: Parse all - id: all - run: | - echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" - - name: Infer github event - id: github-event - run: | - echo "main=${{ github.event_name }}" | tee -a "$GITHUB_OUTPUT" cicd-test-container-build: - if: ${{ github.event.label.name == 'Run CICD' || needs.pre-flight.outputs.event_name == 'workflow_dispatch' }} + if: ${{ needs.pre-flight.outputs.test_to_run != '' }} uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.14.0 needs: pre-flight with: @@ -67,9 +93,9 @@ jobs: prune-filter-timerange: 24h cicd-import-tests: + if: ${{ needs.pre-flight.outputs.test_to_run != '' }} needs: [cicd-test-container-build, pre-flight] runs-on: self-hosted-azure-gpus-1 - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} steps: - name: Run some checks run: | @@ -89,111 +115,111 @@ jobs: L0_Unit_Tests_GPU_ASR: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') with: RUNNER: self-hosted-azure-gpus-1 TIMEOUT: 20 # TODO: remove this hack SCRIPT: | - python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo 
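For context on the new selection flow: pre-flight now emits a JSON array of job names instead of the old boolean `all` output, and every job gates itself with `contains(fromJSON(needs.pre-flight.outputs.test_to_run), '<job name>')`. A minimal sketch of that data flow, assuming yq v4 and jq are available on the runner as the workflow expects (job names shown are illustrative):

    # Collect every job key defined in the workflow into a comma-separated string,
    # mirroring the yq expression used in the pre-flight step above.
    TESTS_TO_RUN=$(yq '.jobs | [to_entries[] | .key] | join(",")' .github/workflows/cicd-main.yml)

    # Convert the comma-separated string into a JSON array, e.g.
    # ["L0_Unit_Tests_GPU_ASR","L0_Unit_Tests_GPU_Audio",...]
    parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
    echo "main=${parsed_string}" >> "$GITHUB_OUTPUT"

    # Downstream, each job's `if:` condition reduces to a membership test such as
    #   contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR')
    # so an empty TESTS_TO_RUN (e.g. an unlabeled pull request) skips every job without
    # the removed `needs.pre-flight.outputs.all` check.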
L0_Unit_Tests_GPU_Audio: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') with: RUNNER: self-hosted-azure-gpus-1 TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_Common: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_LLM: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_Multimodal: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_NLP: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_TTS: needs: [pre-flight, cicd-test-container-build] uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo OPTIONAL_L0_Unit_Tests_GPU_Core: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') with: RUNNER: self-hosted-azure-gpus-1 TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo IS_OPTIONAL: true L0_Unit_Tests_GPU_Hydra: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_Lightning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Lightning') with: RUNNER: self-hosted-azure SAVE_COVERAGE_REPORT: true SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads --cov-report=term --cov=nemo + NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_GPU_Others: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') with: RUNNER: self-hosted-azure-gpus-1 SAVE_COVERAGE_REPORT: true @@ -211,117 +237,117 @@ jobs: --ignore=tests/hydra \ --ignore=tests/lightning \ --ignore=tests/utils \ - --cov-report=term --cov=nemo + --cov-report=term --cov-branch --cov-report=xml --cov=nemo # L0: CPU unit tests L0_Unit_Tests_CPU_ASR: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.pre-flight.outputs.all == 'true' + if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Audio: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Common: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_LLM: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Multimodal: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" 
NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_NLP: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_TTS: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Core: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') with: RUNNER: self-hosted-azure-cpu TIMEOUT: 20 SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Hydra: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Lightning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || 
needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov=nemo + CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_CPU_Others: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') with: RUNNER: self-hosted-azure-cpu SAVE_COVERAGE_REPORT: true @@ -339,7 +365,7 @@ jobs: --ignore=tests/hydra \ --ignore=tests/lightning \ --ignore=tests/utils \ - --cov-report=term --cov=nemo + --cov-report=term --cov-branch --cov-report=xml --cov=nemo L0_Unit_Tests_Coverage: runs-on: self-hosted-azure-cpu @@ -367,26 +393,95 @@ jobs: - L0_Unit_Tests_CPU_Lightning - L0_Unit_Tests_CPU_Others steps: - - - name: Download artifacts + - name: Download coverage reports of current branch uses: actions/download-artifact@v4 with: - path: ${{ github.run_id }} - - - name: Combine + path: ${{ github.run_id }}/current + + - name: Get total coverage of current branch shell: bash -x -e -u -o pipefail {0} + if: always() + id: total-current-branch run: | - pip install coverage - cd ${{ github.run_id }} + cd ${{ github.run_id }}/current ls -al . ls -al coverage-*/ coverage combine --keep $(ls coverage-*/.coverage) coverage report + REPORT=$(coverage report) + TOTAL=$(echo "$REPORT" | grep "TOTAL" | awk '{print substr($NF, 1, length($NF)-1)}') + echo "main=${TOTAL}" | tee -a "$GITHUB_OUTPUT" + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + directory: ${{ github.run_id }}/current + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + if: always() + with: + name: coverage-summary-${{ github.run_id }} + path: ${{ github.run_id }}/current/.coverage + include-hidden-files: true + + # Enable this after we have a first successful run on `main` + # - name: Get last successful run of main + # id: get_run_id + # env: + # GH_TOKEN: ${{ github.token }} + # run: | + # WORKFLOWS=$(curl -L \ + # -H "Accept: application/vnd.github+json" \ + # -H "Authorization: Bearer $GH_TOKEN" \ + # -H "X-GitHub-Api-Version: 2022-11-28" \ + # https://api.github.com/repos/NVIDIA/NeMo/actions/workflows) + + # WORKFLOW_ID=$(echo -E "$WORKFLOWS" | jq '.workflows[] | select(.path==".github/workflows/cicd-main.yml") | .id') + + # RUNS=$(curl -L \ + # -H "Accept: application/vnd.github+json" \ + # -H "Authorization: Bearer $GH_TOKEN" \ + # -H "X-GitHub-Api-Version: 2022-11-28" \ + # "https://api.github.com/repos/NVIDIA/NeMo/actions/workflows/$WORKFLOW_ID/runs?branch=main&status=success") + + # RUN_ID=$(echo -E "$RUNS" | jq '.workflow_runs[0].id') + + # echo "main=$RUN_ID" | tee -a "$GITHUB_OUTPUT" + + # - name: Download coverage summary of main branch + # uses: actions/download-artifact@v4 + # with: + # path: ${{ github.run_id }}/main + # run-id: ${{ steps.get_run_id.outputs.main }} + + # - name: Get total coverage of main branch + # shell: bash -x -e -u -o 
pipefail {0} + # if: always() + # id: total-main-branch + # run: | + # cd ${{ github.run_id }}/main + # coverage combine --keep $(ls coverage-summary-*/.coverage) + # coverage report + + # REPORT=$(coverage report) + # TOTAL=$(echo "$REPORT" | grep "TOTAL" | awk '{print substr($NF, 1, length($NF)-1)}') + # echo "main=${TOTAL}" | tee -a "$GITHUB_OUTPUT" + + # - name: Compare coverage change + # shell: bash -x -e -u -o pipefail {0} + # run: | + # TOTAL_MAIN_BRANCH=${{ steps.total-main-branch.outputs.main }} + # TOTAL_CURRENT_BRANCH=${{ steps.total-current-branch.outputs.main }} + + # test $TOTAL_CURRENT_BRANCH -ge $TOTAL_MAIN_BRANCH + L0_Setup_Test_Data_And_Models: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Setup_Test_Data_And_Models') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L0_Setup_Test_Data_And_Models') with: RUNNER: self-hosted-azure SCRIPT: | @@ -398,7 +493,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Bert: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Bert') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Bert') with: RUNNER: self-hosted-azure SCRIPT: | @@ -409,7 +504,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Mamba2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Mamba2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Mamba2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -422,7 +517,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Llama: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -434,7 +529,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Llama3: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama3') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama3') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -446,7 +541,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_StarCoder: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_StarCoder') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_StarCoder') with: RUNNER: self-hosted-azure SCRIPT: | @@ -458,7 +553,7 @@ jobs: L2_Community_LLM_Checkpoints_tests_Falcon: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Falcon') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Falcon') with: RUNNER: self-hosted-azure SCRIPT: | @@ -470,7 +565,7 @@ jobs: L2_Community_vita_Checkpoints_tests_Llama3: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_vita_Checkpoints_tests_Llama3') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Community_vita_Checkpoints_tests_Llama3') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -516,7 +611,7 @@ jobs: L2_PTQ_Llama2_FP8: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') with: RUNNER: self-hosted-azure SCRIPT: | @@ -580,7 +675,7 @@ jobs: L2_Distill_Llama2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Distill_Llama2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Distill_Llama2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -607,7 +702,7 @@ jobs: L2_Prune_Width_Llama2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Prune_Width_Llama2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Prune_Width_Llama2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -628,7 +723,7 @@ jobs: L2_Prune_Depth_Llama2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Prune_Depth_Llama2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Prune_Depth_Llama2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -646,7 +741,7 @@ jobs: ASR_dev_run_Speech_to_Text: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -661,7 +756,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_CitriNet') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_CitriNet') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -679,7 +774,7 @@ jobs: ASR_dev_run_Speech_Pre-training_-_CitriNet: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_Pre-training_-_CitriNet') || needs.pre-flight.outputs.all == 'true' + if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_Pre-training_-_CitriNet') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -695,7 +790,7 @@ jobs: ASR_dev_run_Speech_To_Text_Finetuning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_Finetuning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_Finetuning') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -713,7 +808,7 @@ jobs: ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_HF_Finetuning') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: |- @@ -749,7 +844,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_Conformer') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_Conformer') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -770,7 +865,7 @@ jobs: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -791,7 +886,7 @@ jobs: L2_Speech_to_Text_EMA: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_to_Text_EMA') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_to_Text_EMA') with: RUNNER: self-hosted-azure SCRIPT: | @@ -807,7 +902,7 @@ jobs: L2_Speech_to_Text_AED: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_to_Text_AED') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_to_Text_AED') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -847,7 +942,7 @@ jobs: L2_Speaker_dev_run_Speaker_Recognition: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Recognition') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Recognition') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -866,7 +961,7 @@ jobs: L2_Speaker_dev_run_Speaker_Diarization: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 
'L2_Speaker_dev_run_Speaker_Diarization') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -886,7 +981,7 @@ jobs: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -901,7 +996,7 @@ jobs: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -913,7 +1008,7 @@ jobs: L2_Speaker_dev_run_Speech_to_Label: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speech_to_Label') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speech_to_Label') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -936,7 +1031,7 @@ jobs: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -954,7 +1049,7 @@ jobs: L2_Speaker_dev_run_Clustering_Diarizer_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Clustering_Diarizer_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Clustering_Diarizer_Inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -971,7 +1066,7 @@ jobs: L2_Speaker_dev_run_Neural_Diarizer_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Neural_Diarizer_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Neural_Diarizer_Inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -985,7 +1080,7 @@ jobs: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 
'L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1001,7 +1096,7 @@ jobs: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1018,7 +1113,7 @@ jobs: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1043,7 +1138,7 @@ jobs: L2_ASR_Adapters_Linear_Adapters: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Adapters_Linear_Adapters') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Adapters_Linear_Adapters') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1062,7 +1157,7 @@ jobs: L2_ASR_Adapters_RelPos_MHA_Adapters: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Adapters_RelPos_MHA_Adapters') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ASR_Adapters_RelPos_MHA_Adapters') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1083,7 +1178,7 @@ jobs: L2_Speech_Estimate_Duration_Bins: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Estimate_Duration_Bins') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Estimate_Duration_Bins') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1115,7 +1210,7 @@ jobs: L2_Speech_Batch_Size_OOMptimizer: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1134,7 +1229,7 @@ jobs: L2_Speech_Batch_Size_OOMptimizer_Canary: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer_Canary') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer_Canary') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1147,7 +1242,7 @@ jobs: L2_Speech_Transcription_Speech_to_Text_Transcribe: needs: [pre-flight, cicd-test-container-build] uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Speech_to_Text_Transcribe') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Speech_to_Text_Transcribe') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1161,7 +1256,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Full_Manifest') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Full_Manifest') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1180,7 +1275,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_With_Prompt: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_With_Prompt') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_With_Prompt') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1203,7 +1298,7 @@ jobs: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Audio_Dir') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Audio_Dir') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1223,7 +1318,7 @@ jobs: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1242,7 +1337,7 @@ jobs: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1262,7 +1357,7 @@ jobs: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1316,7 +1411,7 @@ jobs: 
L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1343,7 +1438,7 @@ jobs: L2_Pretraining_BERT_pretraining_from_Text: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Pretraining_BERT_pretraining_from_Text') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Pretraining_BERT_pretraining_from_Text') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1373,7 +1468,7 @@ jobs: L2_Pretraining_BERT_from_Preprocessed: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Pretraining_BERT_from_Preprocessed') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Pretraining_BERT_from_Preprocessed') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1406,7 +1501,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1471,7 +1566,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1499,7 +1594,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1526,7 +1621,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Inference') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1542,7 +1637,7 @@ 
jobs: L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Finetuning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Finetuning') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1573,7 +1668,7 @@ jobs: L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -1602,7 +1697,7 @@ jobs: L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1621,7 +1716,7 @@ jobs: L2_Megatron_NMT_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_NMT_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_NMT_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1719,7 +1814,7 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1787,7 +1882,7 @@ jobs: L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1857,7 +1952,7 @@ jobs: L2_RAG_Pipeline_Indexing: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_RAG_Pipeline_Indexing') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_RAG_Pipeline_Indexing') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1875,7 +1970,7 @@ jobs: L2_RAG_Pipeline_Generating: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_RAG_Pipeline_Generating') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_RAG_Pipeline_Generating') with: RUNNER: self-hosted-azure SCRIPT: | @@ -1893,7 +1988,7 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -2003,7 +2098,7 @@ jobs: L2_Megatron_GPT_Skip_Train: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2031,7 +2126,7 @@ jobs: L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2206,7 +2301,7 @@ jobs: L2_Megatron_LM_To_NeMo_Conversion: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2321,7 +2416,7 @@ jobs: L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2391,7 +2486,7 @@ jobs: L2_Megatron_GPT_with_Drop_Optimizer_States_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2429,7 +2524,7 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2510,7 +2605,7 @@ jobs: L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2592,7 +2687,7 @@ jobs: OPTIONAL_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -2704,7 +2799,7 @@ jobs: L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -2731,7 +2826,7 @@ jobs: L2_Megatron_GPT_Finetuning_PP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_PP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_PP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2798,7 +2893,7 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_StarCoder_PP1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_StarCoder_PP1') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -2829,7 +2924,7 @@ jobs: L2_Megatron_GPT_Reranker: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Reranker') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Reranker') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2852,7 +2947,7 @@ jobs: L2_Megatron_GPT_Embedding: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Embedding') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Embedding') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2890,7 +2985,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_PP2_O2: needs: [pre-flight, cicd-test-container-build] 
uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_PP2_O2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_PP2_O2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2939,7 +3034,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_TP2_O1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2_O1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2_O1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -2985,7 +3080,7 @@ jobs: L2_Megatron_GPT_PEFT_Lora_TP2SP1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2SP1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2SP1') with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | @@ -3039,7 +3134,7 @@ jobs: L2_Megatron_GPT_Eval: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Eval') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Eval') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3053,7 +3148,7 @@ jobs: L2_Megatron_GPT_Eval_PP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Eval_PP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_Eval_PP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3069,7 +3164,7 @@ jobs: L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3121,7 +3216,7 @@ jobs: L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3138,7 +3233,7 @@ jobs: L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 
'L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3155,7 +3250,7 @@ jobs: L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3230,7 +3325,7 @@ jobs: L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3325,7 +3420,7 @@ jobs: L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3420,7 +3515,7 @@ jobs: OPTIONAL_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3490,7 +3585,7 @@ jobs: L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_w_Mixture_of_Expert_Pretraining') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_T5_w_Mixture_of_Expert_Pretraining') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3532,7 +3627,7 @@ jobs: L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3611,7 +3706,7 @@ jobs: L2_Megatron_Core_T5_Eval: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_Eval') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_Eval') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3623,7 +3718,7 @@ jobs: L2_Megatron_Core_T5_PEFT_Lora_TP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_PEFT_Lora_TP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Core_T5_PEFT_Lora_TP2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3671,7 +3766,7 @@ jobs: L2_VLM_HF_Transformer_PEFT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3682,7 +3777,7 @@ jobs: L2_VLM_HF_Transformer_PEFT_FSDP: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_FSDP') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_FSDP') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3693,7 +3788,7 @@ jobs: L2_VLM_HF_Transformer_PEFT_4bit: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_PEFT_4bit') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3704,7 +3799,7 @@ jobs: L2_VLM_HF_Transformer_SFT_FSDP2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_SFT_FSDP2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_VLM_HF_Transformer_SFT_FSDP2') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3715,7 +3810,7 @@ jobs: L2_HF_Transformer_PEFT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3726,7 +3821,7 @@ jobs: L2_HF_Transformer_PEFT_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_nemorun') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3737,7 +3832,7 @@ jobs: L2_HF_Transformer_PEFT_2gpu: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_2gpu') || 
needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_2gpu') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3748,7 +3843,7 @@ jobs: L2_HF_Transformer_PEFT_2gpu_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_2gpu_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PEFT_2gpu_nemorun') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3759,7 +3854,7 @@ jobs: L2_HF_Transformer_SFT_2gpu: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3770,7 +3865,7 @@ jobs: L2_HF_Transformer_SFT_FSDP2_2gpu: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_FSDP2_2gpu') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_FSDP2_2gpu') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3781,7 +3876,7 @@ jobs: L2_HF_Transformer_PT_2gpu: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3792,7 +3887,7 @@ jobs: L2_HF_Transformer_SFT_2gpu_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu_nemorun') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3803,7 +3898,7 @@ jobs: L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3814,7 +3909,7 @@ jobs: L2_HF_Transformer_PT_2gpu_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_2gpu_nemorun') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3825,7 +3920,7 @@ jobs: L2_HF_Transformer_PT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT') || needs.pre-flight.outputs.all == 'true' + if: 
contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3836,7 +3931,7 @@ jobs: L2_HF_Transformer_PT_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_nemorun') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3847,7 +3942,7 @@ jobs: L2_HF_Transformer_SFT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3858,7 +3953,7 @@ jobs: L2_HF_Transformer_SFT_nemorun: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_nemorun') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_nemorun') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3869,7 +3964,7 @@ jobs: L2_HF_Transformer_SFT_TE_Acceleration: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_TE_Acceleration') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SFT_TE_Acceleration') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3880,7 +3975,7 @@ jobs: L2_HF_Transformer_PT_TE_Acceleration: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_PT_TE_Acceleration') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3888,11 +3983,23 @@ jobs: AFTER_SCRIPT: | rm -rf nemo_experiments + # L2: SpeechLM tests + L2_HF_Transformer_SpeechLM_SFT_2gpu: + needs: [pre-flight, cicd-test-container-build] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_HF_Transformer_SpeechLM_SFT_2gpu') || needs.pre-flight.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + TRANSFORMERS_OFFLINE=1 python tests/collections/speechlm/hf/sft.py --model /home/TestData/speechlm/whisper-small/ --max-steps 10 --devices 2 --strategy ddp + AFTER_SCRIPT: | + rm -rf nemo_experiments + # L2: Megatron Mock Data Generation L2_Megatron_Mock_Data_Generation_MockGPTDataset: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockGPTDataset') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockGPTDataset') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3908,7 +4015,7 @@ jobs: L2_Megatron_Mock_Data_Generation_MockT5Dataset: needs: [pre-flight, cicd-test-container-build] 
uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockT5Dataset') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockT5Dataset') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3926,7 +4033,7 @@ jobs: L2_TTS_Fast_dev_runs_1_Tacotron_2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Tacotron_2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Tacotron_2') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -3952,7 +4059,7 @@ jobs: L2_TTS_Fast_dev_runs_1_WaveGlow: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_WaveGlow') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_WaveGlow') with: RUNNER: self-hosted-azure SCRIPT: | @@ -3974,7 +4081,7 @@ jobs: L2_TTS_Fast_dev_runs_1_FastPitch: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_FastPitch') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_FastPitch') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4048,7 +4155,7 @@ jobs: L2_TTS_Fast_dev_runs_1_Hifigan: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Hifigan') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Hifigan') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4100,7 +4207,7 @@ jobs: Speech_Checkpoints_tests: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Speech_Checkpoints_tests') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Speech_Checkpoints_tests') with: RUNNER: self-hosted-azure-gpus-1 TIMEOUT: 20 @@ -4116,7 +4223,7 @@ jobs: L2_Stable_Diffusion_Training: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Stable_Diffusion_Training') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_Stable_Diffusion_Training') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4166,7 +4273,7 @@ jobs: L2_NeMo_2_GPT_Pretraining_no_transformer_engine: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_Pretraining_no_transformer_engine') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_Pretraining_no_transformer_engine') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4199,7 +4306,7 @@ jobs: L2_NeMo_2_llama3_pretraining_recipe: needs: [pre-flight, cicd-test-container-build] uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_pretraining_recipe') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_pretraining_recipe') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4227,7 +4334,7 @@ jobs: L2_NeMo_2_llama3_fault_tolerance_plugin: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_fault_tolerance_plugin') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_fault_tolerance_plugin') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4247,7 +4354,7 @@ jobs: L2_NeMo_2_llama3_straggler_detection: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_straggler_detection') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_llama3_straggler_detection') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4267,7 +4374,7 @@ jobs: L2_NeMo_2_GPT_DDP_Param_Parity_check: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_DDP_Param_Parity_check') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4284,7 +4391,7 @@ jobs: L2_NeMo_2_SSM_Pretraining: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4298,7 +4405,7 @@ jobs: L2_NeMo_2_SSM_Finetuning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4312,7 +4419,7 @@ jobs: L2_NeMo_2_HF_MODEL_IMPORT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_HF_MODEL_IMPORT') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_HF_MODEL_IMPORT') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4325,7 +4432,7 @@ jobs: L2_NeMo_2_jit_callback: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_jit_callback') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_jit_callback') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4335,7 +4442,7 @@ jobs: L2_NeMo_2_T5_Pretraining: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 
'L2_NeMo_2_T5_Pretraining') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4359,7 +4466,7 @@ jobs: L2_NeMo_2_T5_Finetuning: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_T5_Finetuning') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_T5_Finetuning') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4374,7 +4481,7 @@ jobs: L2_NeMo_2_T5_LoRA: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_T5_LoRA') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4390,7 +4497,7 @@ jobs: L2_NeMo_2_NEVA_MOCK_TRAINING: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_TRAINING') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_TRAINING') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4402,7 +4509,7 @@ jobs: L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4415,7 +4522,7 @@ jobs: L2_NeMo_2_MLLAMA_MOCK_TRAINING: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_MLLAMA_MOCK_TRAINING') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_MLLAMA_MOCK_TRAINING') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4428,7 +4535,7 @@ jobs: L2_NeMo_2_Mixtral_Pretraining: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_Pretraining') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_Pretraining') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4439,7 +4546,7 @@ jobs: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4468,7 +4575,7 @@ jobs: L2_NeMo_2_GPT_SFT_TP1PP1_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS2') with: RUNNER: 
self-hosted-azure SCRIPT: | @@ -4497,7 +4604,7 @@ jobs: L2_NeMo_2_GPT_SFT_TP1PP2_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP2_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP2_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4526,7 +4633,7 @@ jobs: L2_NeMo_2_GPT_SFT_TP2PP1_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP2PP1_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP2PP1_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4555,7 +4662,7 @@ jobs: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4584,7 +4691,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4613,7 +4720,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4642,7 +4749,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4671,7 +4778,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4699,7 +4806,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4727,7 +4834,7 @@ jobs: 
L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4755,7 +4862,7 @@ jobs: L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4782,7 +4889,7 @@ jobs: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4812,7 +4919,7 @@ jobs: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4826,7 +4933,7 @@ jobs: L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4841,7 +4948,7 @@ jobs: OPTIONAL_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4857,7 +4964,7 @@ jobs: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4872,7 +4979,7 @@ jobs: L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1') with: RUNNER: self-hosted-azure 
SCRIPT: | @@ -4887,7 +4994,7 @@ jobs: L2_NEMO_2_LoRA_MERGE: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_MERGE') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_MERGE') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4899,7 +5006,7 @@ jobs: L2_NEMO_2_LoRA_Export: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_Export') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_Export') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4911,7 +5018,7 @@ jobs: L2_NEMO_2_LoRA_Inference: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_Inference') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NEMO_2_LoRA_Inference') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4928,7 +5035,7 @@ jobs: L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4937,7 +5044,7 @@ jobs: L2_NeMo_2_PTQ_Llama2_FP8: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_PTQ_Llama2_FP8') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_PTQ_Llama2_FP8') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4952,7 +5059,7 @@ jobs: L2_NeMo_2_Export_In_Framework: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_In_Framework') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_In_Framework') with: RUNNER: self-hosted-azure SCRIPT: | @@ -4981,7 +5088,7 @@ jobs: L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING') with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: | @@ -4996,7 +5103,7 @@ jobs: L2_NeMo_2_VLLM_EXPORT: needs: [pre-flight, cicd-test-container-build] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.pre-flight.outputs.all == 'true' + if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') with: RUNNER: self-hosted-azure SCRIPT: | @@ -5024,7 +5131,7 @@ jobs: rm -rf /tmp/vllm_from_nemo2 Nemo_CICD_Test: - needs: + needs: - pre-flight - cicd-import-tests @@ -5164,6 +5271,7 @@ jobs: - 
L2_HF_Transformer_PT_2gpu - L2_HF_Transformer_PT_2gpu_nemorun - L2_HF_Transformer_PT_TE_Acceleration + - L2_HF_Transformer_SpeechLM_SFT_2gpu - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning - L2_NeMo_2_T5_Pretraining @@ -5212,7 +5320,7 @@ jobs: - L2_HF_Transformer_SFT_FSDP2_2gpu - L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2 - L2_NeMo_2_VLLM_EXPORT - if: always() + if: always() && github.event != 'push' runs-on: ubuntu-latest steps: - name: Evaluate conclusion diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 3730e0bcf955..b8d38b24b319 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -1,4 +1,4 @@ -name: Isort and Black Formatting; PyLint Docs check +name: Isort and Black Formatting # Incrementally reformat only changed files with black, all files with isort # # Replaces pre-commit.ci, since it reformats all the files. @@ -71,145 +71,3 @@ jobs: with: message: Apply isort and black reformatting commit: --signoff - - check_pylint: - name: "check_pylint (strict-mode: ${{ matrix.strict-mode }})" - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - env: - THRESHOLD: 1730937600 # On this date (2024/11/07) we decided to add Pylint. It shall only run in strict mode for files added past this date. For files prior to this date, we will only add a PR comment with PyLint's stdout. - strategy: - matrix: - strict-mode: ["true", "false"] - steps: - - name: Checkout branch - uses: actions/checkout@v4 - with: - # setup repository and ref for PRs, see - # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} - fetch-depth: 0 - - # https://github.com/tj-actions/changed-files - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v44 - with: - files: | - **.py - - - name: Setup Python env - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: pylint - if: ${{ steps.changed-files.outputs.any_changed == 'true' && !contains( github.event.pull_request.labels.*.name, 'skip-docs') }} - id: pylint - env: - # only *.py files included - STRICT_MODE: ${{ matrix.strict-mode }} - CHANGED_FILES: "${{ steps.changed-files.outputs.all_changed_files }}" - run: | - pip install pylint - - FILTERED=() - for file in $CHANGED_FILES; do - DATE=$(git log --format=%ad --date=unix "$file" | tail -1) - - if [[ "$STRICT_MODE" == "true" ]]; then - if [[ "$DATE" -gt "$THRESHOLD" ]]; then - FILTERED+=("$file") - fi - else - if [[ "$DATE" -le "$THRESHOLD" ]]; then - FILTERED+=("$file") - fi - fi - done - - if [ ${#FILTERED[@]} -eq 0 ]; then - echo "No files to check." - exit 0 - fi - - echo "Will run on these files: - ${FILTERED[@]}" - - set +e - LOG=$(pylint ${FILTERED[@]}) - EXIT_CODE=$? - set -e - - set +x - echo "OUTPUT<> $GITHUB_ENV - echo "$LOG" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - echo "log=$LOG" - set -x - - echo "exit-code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT" - - if [[ "${{ matrix.strict-mode }}" == "true" ]]; then - HEADER="🚨 The following files must be fixed before merge!" - else - HEADER="🙏 The following files have warnings. In case you are familiar with these, please try helping us to improve the code base." 
- fi - echo "header=$HEADER" | tee -a "$GITHUB_OUTPUT" - - exit $([[ "$EXIT_CODE" -ne 0 && "$STRICT_MODE" == "true" ]] && echo $EXIT_CODE || echo 0) - - - name: Find Comment - if: ${{ always() }} - uses: peter-evans/find-comment@v3 - id: fc - with: - issue-number: ${{ github.event.number }} - body-includes: - - - name: Delete comment - if: ${{ always() && steps.fc.outputs.comment-id != '' }} - env: - GH_TOKEN: ${{ secrets.github_token }} - REPOSITORY: ${{ github.repository }} - COMMENT_ID: ${{ steps.fc.outputs.comment-id }} - run: | - curl -L \ - -X DELETE \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GH_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/$REPOSITORY/issues/comments/$COMMENT_ID - - - name: Add PR comment for PyLint - if: ${{ always() && steps.pylint.outputs.exit-code != '0' }} - uses: peter-evans/create-or-update-comment@v4 - with: - issue-number: ${{ github.event.number }} - body: | - - - beep boop 🤖: ${{ steps.pylint.outputs.header }} - - --- - - Your code was analyzed with PyLint. The following annotations have been identified: - - ``` - ${{ env.OUTPUT }} - ``` - - --- - - Mitigation guide: - - * Add sensible and useful docstrings to functions and methods - * For trivial methods like getter/setters, consider adding `# pylint: disable=C0116` inside the function itself - * To disable multiple functions/methods at once, put a `# pylint: disable=C0116` before the first and a `# pylint: enable=C0116` after the last. - - By applying these rules, we reduce the occurance of this message in future. - - Thank you for improving NeMo's documentation! diff --git a/.github/workflows/code-linting.yml b/.github/workflows/code-linting.yml new file mode 100644 index 000000000000..e8025e481fe4 --- /dev/null +++ b/.github/workflows/code-linting.yml @@ -0,0 +1,137 @@ +name: PyLint and flake8 linting + +on: + pull_request: + paths: + - '**.py' + types: [ opened, synchronize, reopened, labeled, unlabeled ] + +jobs: + linting: + name: 'Domain: ${{ matrix.domain }}' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + domain: [speech, other] + env: + DOMAIN: ${{ matrix.domain }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Select filter + id: filter + run: | + if [[ "$DOMAIN" == "speech" ]]; then + FILTER=$(jq -crn '[ + "nemo/collections/asr/**", + "nemo/collections/tts/**", + "nemo/collections/audio/**", + "nemo/collections/multimodal/speech_llm/**", + "nemo/collections/speechlm/**" + ] | join(",")') + + else + FILTER=$(jq -crn '[ + "nemo/**", + "!nemo/collections/asr/**", + "!nemo/collections/tts/**", + "!nemo/collections/audio/**", + "!nemo/collections/multimodal/speech_llm/**", + "!nemo/collections/speechlm/**" + ] | join(",")') + fi + + echo "main=$FILTER" | tee -a "$GITHUB_OUTPUT" + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v44 + with: + files: ${{ steps.filter.outputs.main }} + files_separator: "," + separator: " " + + - name: Run PyLint + id: pylint + env: + CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + if [[ -z "$CHANGED_FILES" ]]; then + echo Nothing to lint. + echo "exit-code=0" | tee -a "$GITHUB_OUTPUT" + exit 0 + fi + + pip install pylint + set +e + pylint --output "pylintrc.$DOMAIN.txt" --rcfile ".pylintrc.$DOMAIN" ${CHANGED_FILES[@]} + echo "exit-code=$?" 
| tee -a "$GITHUB_OUTPUT" + + - name: Run flake8 + id: flake8 + env: + CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + if [[ -z "$CHANGED_FILES" ]]; then + echo Nothing to lint. + echo "exit-code=0" | tee -a "$GITHUB_OUTPUT" + exit 0 + fi + + pip install flake8 + set +e + flake8 --output "flake8.$DOMAIN.txt" --config ".flake8.$DOMAIN" ${CHANGED_FILES[@]} + echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT" + + - name: Summary + env: + PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }} + FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }} + run: | + + if [[ "$PYLINT" != "true" ]]; then + echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY + + echo '```' | tee -a $GITHUB_STEP_SUMMARY + cat pylintrc.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY + echo '```' | tee -a $GITHUB_STEP_SUMMARY + fi + + if [[ "$FLAKE8" != "true" ]]; then + echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY + + echo '```' | tee -a $GITHUB_STEP_SUMMARY + cat flake8.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY + echo '```' | tee -a $GITHUB_STEP_SUMMARY + fi + + if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" ]]; then + echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY + + echo '```' | tee -a $GITHUB_STEP_SUMMARY + echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY + echo '```' | tee -a $GITHUB_STEP_SUMMARY + + exit 1 + fi + + Nemo_Linting_Test: + needs: linting + runs-on: ubuntu-latest + if: always() + steps: + - name: Main + env: + RESULTS: ${{ toJson(needs.linting) }} + run: | + RESULT=$(echo "$RESULTS" | jq -r '.result') + + if [[ "$RESULT" == "success" ]]; then + echo "All passed." + exit 0 + else + echo "Some linting domains failed." + exit 1 + fi \ No newline at end of file diff --git a/.github/workflows/monitor-vms.yml b/.github/workflows/monitor-vms.yml index 0bb54524847a..722a4720b0e9 100644 --- a/.github/workflows/monitor-vms.yml +++ b/.github/workflows/monitor-vms.yml @@ -4,10 +4,11 @@ on: schedule: - cron: 0/15 * * * * workflow_dispatch: - + jobs: pre-flight: runs-on: ubuntu-latest + if: github.repository_owner == 'NVIDIA' outputs: list-of-vms: ${{ steps.main.outputs.main }} environment: main @@ -25,13 +26,13 @@ jobs: MATRIX=$(echo $RUNNERS \ | jq -c '[ - .runners[] + .runners[] | select(.status == "online") | select(.name | contains("cpu") | not) | { - "vm": .name, + "vm": .name, "n_gpus": [ - .labels[] + .labels[] | select(.name | endswith("gpu")) | .name ][0][:1] } diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c1cd763a0501..de4ff6f1bdc3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,17 +32,11 @@ on: jobs: release: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.3 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.20.1 with: release-ref: ${{ inputs.release-ref }} - image-name: nemo_container - dockerfile: Dockerfile.ci - image-label: nemo-core - build-args: | - IMAGE_LABEL=nemo-core - prune-filter-timerange: 24h python-package: nemo - container-workdir: /workspace + python-version: '3.10' library-name: Neural Modules dry-run: ${{ inputs.dry-run }} version-bump-branch: ${{ inputs.version-bump-branch }} diff --git a/.pylintrc b/.pylintrc.other similarity index 100% rename from .pylintrc rename to .pylintrc.other diff --git a/.pylintrc.speech b/.pylintrc.speech new file mode 100644 index 000000000000..9c8830567460 --- /dev/null +++ b/.pylintrc.speech @@ -0,0 +1,9 @@ +[MAIN] +ignore-paths=tests +max-line-length=119 + 
+[MESSAGES CONTROL] +disable=all + +enable=W0611 +# W0611: unused-import diff --git a/examples/speechlm/sft/hf.py b/examples/speechlm/sft/hf.py index 96e785dac97f..3a64ea62dcd3 100755 --- a/examples/speechlm/sft/hf.py +++ b/examples/speechlm/sft/hf.py @@ -27,6 +27,17 @@ class LhotseHfNeMoDataset(torch.utils.data.Dataset): + """Class for a speechLM dataset + + Args: + processor (AutoProcessor): the processor to use + tokenizer (AutoTokenizer): the tokenizer to use + decoder_mask_fill (int): Value to fill in decoder mask + + Returns: + pl.LightningDataModule: the dataset to train with. + """ + def __init__(self, processor, tokenizer, decoder_mask_fill=-100): super().__init__() self.processor = processor @@ -69,6 +80,7 @@ def __getitem__(self, cuts): # Models can be one of the supported ones by AutoModelForSpeechSeq2Seq such as # openai/whisper-large-v3 and facebook/s2t-small-librispeech-asr parser.add_argument('--model', default='openai/whisper-large-v3') + parser.add_argument('--data-path', type=str, required=True) parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) parser.add_argument('--devices', default=1) parser.add_argument('--accelerator', default='gpu', choices=['gpu']) @@ -83,7 +95,7 @@ def __getitem__(self, cuts): config = OmegaConf.create( { - "cuts_path": "/opt/checkpoints/lhotse/libri/libri-train-5.jsonl.gz", + "cuts_path": args.data_path, "sample_rate": 16000, "shuffle": True, "num_workers": 2, diff --git a/nemo/collections/audio/parts/submodules/flow.py b/nemo/collections/audio/parts/submodules/flow.py index 748d4c6c6d3b..56e77389b2e0 100644 --- a/nemo/collections/audio/parts/submodules/flow.py +++ b/nemo/collections/audio/parts/submodules/flow.py @@ -234,7 +234,7 @@ def forward( if state_length is not None: state = mask_sequence_tensor(state, state_length) - for t in time_steps: + for t in time_steps[:-1]: time = t * torch.ones(state.shape[0], device=state.device) if estimator_condition is None: diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 5d558c2b451f..306484331fb7 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -36,6 +36,7 @@ from nemo.collections.llm.gpt.data import ( AlpacaDataModule, ChatDataModule, + CustomRetrievalDataModule, DollyDataModule, FineTuningDataModule, HFDatasetDataModule, @@ -91,7 +92,9 @@ Llama31Config405B, Llama32Config1B, Llama32Config3B, + Llama32EmbeddingConfig1B, LlamaConfig, + LlamaEmbeddingModel, LlamaModel, MaskedTokenLossReduction, MistralConfig7B, @@ -150,6 +153,7 @@ __all__ = [ "MockDataModule", "T5MockDataModule", + "CustomRetrievalDataModule", "GPTModel", "GPTConfig", "gpt_data_step", @@ -185,6 +189,8 @@ "Nemotron4Config15B", "Nemotron4Config340B", "NemotronConfig", + "LlamaEmbeddingModel", + "Llama32EmbeddingConfig1B", "Phi3Config", "Phi3ConfigMini", "Phi3Model", diff --git a/nemo/collections/llm/bert/loss.py b/nemo/collections/llm/bert/loss.py index 6fd34a4d3fa3..3bbbdfbd8e49 100644 --- a/nemo/collections/llm/bert/loss.py +++ b/nemo/collections/llm/bert/loss.py @@ -99,6 +99,89 @@ def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor: return torch.tensor(0.0, device=torch.cuda.current_device()) +class HardNegativeRankingLoss(MegatronLossReduction): + """ + This loss uses hard-negative samples. 
+ The difference between this loss and the default MultipleNegativesRankingLoss + from Sentence Transformers is that the latter shares the hard negatives + as negatives for all examples, whereas this loss uses hard negatives + exclusively for the example they are associated with. + """ + + def __init__( + self, + validation_step: bool = False, + val_drop_last: bool = True, + num_hard_negatives: int = 1, + scale: float = 50, + label_smoothing: float = 0.0, + ) -> None: + super().__init__() + self.validation_step = validation_step + self.val_drop_last = val_drop_last + self.num_hard_negatives = num_hard_negatives + self.scale = scale + self.cross_entropy_loss = nn.CrossEntropyLoss(label_smoothing=label_smoothing) + + def forward( + self, batch: Dict[str, torch.Tensor], forward_out: torch.Tensor + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + from megatron.core import parallel_state + + cp_size = parallel_state.get_context_parallel_world_size() + if cp_size != 1: + raise NotImplementedError(f'CP is not supported for {self.__class__} yet.') + + num_tensors_per_example = 2 + self.num_hard_negatives # 1 query, 1 pos, num_hard_negatives negs + current_train_n_passages = 1 + self.num_hard_negatives + batch_size = forward_out.shape[0] // num_tensors_per_example + # Get Query, Key (Positives, Negatives) + # forward_out was chunked [(q1, k1), (q2, k2), ...] + chunks = forward_out.chunk(batch_size) + query = torch.stack([item[0] for item in chunks]) + key = torch.cat([item[1:] for item in chunks]) + + assert key.shape[0] % query.shape[0] == 0, '{} % {} > 0'.format(key.shape[0], query.shape[0]) + assert key.shape[0] / query.shape[0] == current_train_n_passages, '{} / {} != {}'.format( + key.shape[0], query.shape[0], current_train_n_passages + ) + query_shape = query.shape + repeated_query = query.repeat(1, 1, current_train_n_passages).reshape( + query_shape[0] * current_train_n_passages, query_shape[1] + ) + scores = torch.sum(repeated_query * key, dim=-1).reshape(query_shape[0], current_train_n_passages) + labels = torch.zeros(query_shape[0], dtype=torch.long, device=query.device) + scores *= self.scale + ce_loss = self.cross_entropy_loss(scores, labels) + reduced_loss = average_losses_across_data_parallel_group([ce_loss]) + return ce_loss, {"avg": reduced_loss} + + def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor: + """Taken from: https://github.com/NVIDIA/NeMo/blob/main + /nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L535-L552 .""" + if losses_reduced_per_micro_batch: + if "avg" in losses_reduced_per_micro_batch[0]: + loss_tensors_list = [loss_reduced["avg"] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + + return loss_tensor.mean() + + # Get the total loss since micro batch sizes are not uniform + loss_sum_tensors_list: List[torch.Tensor] = [ + loss_sum["loss_sum_and_ub_size"] + for loss_sum in losses_reduced_per_micro_batch + if loss_sum["loss_sum_and_ub_size"][1] > 0 + ] + loss_sum = ( + torch.vstack(loss_sum_tensors_list).sum(dim=0) + if len(loss_sum_tensors_list) > 0 + else torch.tensor([0.0, 0.0], device=torch.cuda.current_device()) + ) + return loss_sum + + return torch.tensor(0.0, device=torch.cuda.current_device()) + + +class BERTInBatchExclusiveHardNegativesRankingLoss(MegatronLossReduction): + """ + This loss uses in-batch negative samples + hard-negative samples.
diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py index 89b5a3dc4b54..fd8935d9c11a 100644 --- a/nemo/collections/llm/gpt/data/__init__.py +++ b/nemo/collections/llm/gpt/data/__init__.py @@ -19,6 +19,7 @@ from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule from nemo.collections.llm.gpt.data.mock import MockDataModule from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule, build_pretraining_datamodule +from nemo.collections.llm.gpt.data.retrieval import CustomRetrievalDataModule from nemo.collections.llm.gpt.data.squad import SquadDataModule __all__ = [ @@ -31,4 +32,5 @@ "PreTrainingDataModule", "build_pretraining_datamodule", "SquadDataModule", + "CustomRetrievalDataModule", ] diff --git a/nemo/collections/llm/gpt/data/retrieval.py b/nemo/collections/llm/gpt/data/retrieval.py new file mode 100644 index 000000000000..058068e811e0 --- /dev/null +++ b/nemo/collections/llm/gpt/data/retrieval.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os.path +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from datasets import Dataset + +from nemo.collections.llm.bert.data.fine_tuning import FineTuningDataModule +from nemo.collections.llm.gpt.data.core import get_dataset_root +from nemo.utils import logging + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers import TokenizerSpec + from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs + + +# Custom Retrieval Data Module loaded with json file +class CustomRetrievalDataModule(FineTuningDataModule): + """ """ + + def __init__( + self, + data_root: str, + dataset_identifier: str = "custom_retrieval_dataset", + seq_length: int = 2048, + tokenizer: Optional["TokenizerSpec"] = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + force_redownload: bool = False, + delete_raw: bool = True, + seed: int = 1234, + memmap_workers: int = 1, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + packed_sequence_specs: Optional["PackedSequenceSpecs"] = None, + query_key: str = "question", + pos_doc_key: str = "pos_doc", + neg_doc_key: str = "neg_doc", + dataset_kwargs: Optional[Dict[str, Any]] = None, + ): + self.force_redownload = force_redownload + self.delete_raw = delete_raw + + assert packed_sequence_specs is None, "RetrievalDataModule does not support packed sequences." + assert os.path.exists(data_root), "Data root does not exist." 
+ self.query_key = query_key + self.pos_doc_key = pos_doc_key + self.neg_doc_key = neg_doc_key + self.unprocessed_root = data_root + super().__init__( + dataset_root=get_dataset_root(dataset_identifier), + seq_length=seq_length, + tokenizer=tokenizer, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + seed=seed, + memmap_workers=memmap_workers, + num_workers=num_workers, + pin_memory=pin_memory, + persistent_workers=persistent_workers, + dataset_kwargs=dataset_kwargs, + ) + + def prepare_data(self) -> None: + """Prepare data if not split already.""" + if not self.train_path.exists() or self.force_redownload: + self._preprocess_and_split_data() + super().prepare_data() + + def _preprocess_and_split_data(self, train_ratio: float = 0.95, val_ratio: float = 0.04): + logging.info(f"Preprocessing {self.__class__.__name__} to jsonl format and splitting...") + + test_ratio = 1 - train_ratio - val_ratio + save_splits = {} + dataset = Dataset.from_list(json.load(open(self.unprocessed_root, 'r'))) + split_dataset = dataset.train_test_split(test_size=val_ratio + test_ratio, seed=self.seed) + split_dataset2 = split_dataset['test'].train_test_split( + test_size=test_ratio / (val_ratio + test_ratio), seed=self.seed + ) + save_splits['training'] = split_dataset['train'] + save_splits['validation'] = split_dataset2['train'] + save_splits['test'] = split_dataset2['test'] + + for split_name, dataset in save_splits.items(): + output_file = self.dataset_root / f"{split_name}.jsonl" + with output_file.open("w", encoding="utf-8") as f: + for o in dataset: + # We only write one positive document for now + # All negative documents are written + pos_doc = o[self.pos_doc_key][0] if isinstance(o[self.pos_doc_key], list) else o[self.pos_doc_key] + neg_doc = o[self.neg_doc_key] if isinstance(o[self.neg_doc_key], list) else [o[self.neg_doc_key]] + f.write(json.dumps({"query": o[self.query_key], "pos_doc": pos_doc, "neg_doc": neg_doc}) + "\n") + + logging.info(f"{split_name} split saved to {output_file}") diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index 4e9448eaef2c..d9ab48e0ea51 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -64,6 +64,7 @@ LlamaConfig, LlamaModel, ) +from nemo.collections.llm.gpt.model.llama_embedding import Llama32EmbeddingConfig1B, LlamaEmbeddingModel from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel, MistralNeMoConfig12B from nemo.collections.llm.gpt.model.mixtral import ( MixtralConfig, @@ -145,6 +146,8 @@ "Nemotron3Config22B", "Nemotron4Config340B", "NemotronModel", + "LlamaEmbeddingModel", + "Llama32EmbeddingConfig1B", "Phi3Config", "Phi3ConfigMini", "Phi3Model", diff --git a/nemo/collections/llm/gpt/model/hf_llama_embedding.py b/nemo/collections/llm/gpt/model/hf_llama_embedding.py new file mode 100644 index 000000000000..ba89626ff45f --- /dev/null +++ b/nemo/collections/llm/gpt/model/hf_llama_embedding.py @@ -0,0 +1,190 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.cache_utils import Cache +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask +from transformers.modeling_outputs import SequenceClassifierOutputWithPast +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaForSequenceClassification, LlamaModel +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +def pool(last_hidden_states: Tensor, attention_mask: Tensor, pool_type: str) -> Tensor: + """Pooling on last_hidden_states without pad tokens.""" + last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) + + if pool_type == "avg": + emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + elif pool_type == "weighted_avg": + emb = last_hidden.sum(dim=1) + elif pool_type == "cls": + emb = last_hidden[:, 0] + elif pool_type == "last": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + emb = last_hidden[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden.shape[0] + emb = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths] + else: + raise ValueError(f"pool_type {pool_type} not supported") + + return emb + + +class LlamaBidirectionalConfig(LlamaConfig): + """LLamaBidirectionalConfig for LlamaBidirectionalModel.""" + + model_type = "llama_bidirec" + + def __init__( + self, + pooling="avg", + temperature=1.0, + **kwargs, + ): + self.pooling = pooling + self.temperature = temperature + super().__init__( + **kwargs, + ) + + +class LlamaBidirectionalModel(LlamaModel): + """LlamaBidirectionalModel. + Attention has been adjusted to bidirectional. + """ + + config_class = LlamaBidirectionalConfig + + def __init__(self, config: LlamaConfig): + super().__init__(config) + for layer in self.layers: + layer.self_attn.is_causal = False + self.config._attn_implementation = "eager" + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + # Generates bi-directional attention. 
+ causal_mask = _prepare_4d_attention_mask(attention_mask, input_tensor.dtype) + return causal_mask + + + class LlamaBidirectionalForSequenceClassification(LlamaForSequenceClassification): + """The LLaMa Model transformer with a sequence classification head on top (linear layer).""" + + config_class = LlamaBidirectionalConfig + + def __init__(self, config): + super().__init__(config) + # Releasing the parameters of LlamaModel + # created by parent LlamaForSequenceClassification + del self.model + + self.model = LlamaBidirectionalModel(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + pooled_hidden_states = pool( + last_hidden_states=hidden_states, + attention_mask=attention_mask, + pool_type=self.config.pooling, + ) + + pooled_logits = self.score(pooled_hidden_states) + pooled_logits = pooled_logits / self.config.temperature + + loss = None + if labels is not None: + labels = labels.to(pooled_logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git
a/nemo/collections/llm/gpt/model/llama_embedding.py b/nemo/collections/llm/gpt/model/llama_embedding.py new file mode 100644 index 000000000000..3d8edcc5121a --- /dev/null +++ b/nemo/collections/llm/gpt/model/llama_embedding.py @@ -0,0 +1,402 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Annotated, Callable, Dict, Literal, Optional, Union + +import einops +import lightning.pytorch as L +import torch +import torch.nn.functional as F +from megatron.core import parallel_state +from megatron.core.transformer.enums import AttnMaskType +from megatron.core.transformer.spec_utils import ModuleSpec +from torch import Tensor, nn + +import nemo.collections.llm.gpt.model.base as GPTBase +from nemo.collections.llm.bert.loss import BERTInBatchExclusiveHardNegativesRankingLoss, HardNegativeRankingLoss +from nemo.collections.llm.gpt.model import GPTConfig +from nemo.collections.llm.gpt.model.llama import HFLlamaImporter, Llama32Config1B, LlamaConfig, LlamaModel +from nemo.collections.llm.utils import Config +from nemo.lightning import OptimizerModule, io +from nemo.lightning.pytorch.utils import dtype_from_hf +from nemo.utils import logging +from nemo.utils.import_utils import safe_import + +if TYPE_CHECKING: + from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel + + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec +_, HAVE_TE = safe_import("transformer_engine") + + +def _local_layer_spec(config: "GPTConfig") -> ModuleSpec: + gpt_layer_spec = GPTBase.local_layer_spec(config) + gpt_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding + return gpt_layer_spec + + +def _transformer_engine_layer_spec(config: "GPTConfig") -> ModuleSpec: + gpt_layer_spec = GPTBase.transformer_engine_layer_spec(config) + gpt_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding + return gpt_layer_spec + + +def get_nv_embedding_layer_spec(config): + """Customized Layer Spec for NV Embedding Llama Model. + Bidirectional attention is enabled instead of causal masking.
+ """ + if HAVE_TE: + return _transformer_engine_layer_spec(config) + else: + return _local_layer_spec(config) + + + def nv_embedding_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: + """Setup NVEmbedding Llama Model dataloader batch.""" + batch = next(dataloader_iter) + + _batch: dict + if isinstance(batch, tuple) and len(batch) == 3: + _batch = batch[0] + else: + _batch = batch + + required_keys = set() + required_keys.add("attention_mask") + + if parallel_state.is_pipeline_first_stage(): + required_keys.add("input_ids") + required_keys.add("position_ids") + + _batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()} + # slice batch along sequence dimension for context parallelism + output = GPTBase.get_batch_on_this_context_parallel_rank(_batch) + + return output + + + def nv_embedding_forward_step(model: L.LightningModule, batch: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + This subsets the batch keys to the ones actually used by the forward pass of the model, + and then calls the model's forward pass. If "cu_seqlens" is defined in the batch, + then the packed sequence parameters are also passed to the model for forward pass efficiency. + """ + forward_args = { + "input_ids": batch["input_ids"], + "attention_mask": batch["attention_mask"], + "position_ids": batch["position_ids"], + } + emb = model.encode(**forward_args) + return emb + + + @dataclass + class Llama32EmbeddingConfig1B(Llama32Config1B): + """Llama3.2 Embedding 1B Config""" + + transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = get_nv_embedding_layer_spec + forward_step_fn: Callable = nv_embedding_forward_step + data_step_fn: Callable = nv_embedding_data_step + + # Training Configs + truncation_method: Literal["left", "right"] = 'right' + num_hard_negatives: int = 4 + ce_loss_scale: float = 50 + label_smoothing: float = 0.0 + in_batch_negatives: bool = False + negative_sample_strategy: Literal["random", "first"] = 'first' + add_bos: bool = True + add_eos: bool = False + + def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel": + """Configure the NV Embedding Llama3.2 1B Model""" + model = super().configure_model(tokenizer, pre_process, post_process) + # post_process needs to be overwritten to False after model init because + # final_layernorm is still needed and it will only be initialized when post_process is True in Mcore. + # And for forward(), we do not want to run through output_layer thus setting post_process to False.
+ model.post_process = False + return model + + +def _average_pool(last_hidden_states: Tensor, attention_mask: Tensor): + """Average the hidden states on the non-masking tokens.""" + # [sq, b, h] -> [b, sq, h] + last_hidden_states = einops.rearrange(last_hidden_states, 's b h -> b s h') + last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + + +class LlamaEmbeddingModel(LlamaModel): + """NV Embedding Llama Model""" + + def __init__( + self, + config: Annotated[Optional[LlamaConfig], Config[LlamaConfig]] = None, + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__(config or LlamaConfig(), optim=optim, tokenizer=tokenizer, model_transform=model_transform) + + @property + def dataset_kwargs(self): + """Getter for dataset_kwargs from model config""" + return { + 'num_hard_negatives': self.config.num_hard_negatives, + 'negative_sample_strategy': self.config.negative_sample_strategy, + 'add_bos': self.config.add_bos, + 'add_eos': self.config.add_eos, + } + + def encode( + self, + input_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: torch.LongTensor, + decoder_input: Optional[torch.Tensor] = None, + ): + """Generate the embedding for the inputs. + It runs the forward and apply average pooling on the last hidden states of the model. + """ + if attention_mask.ndim == 2: + # extend attention mask to [b, 1, 1, sq] + # Also convert attention mask to binary + extended_mask = attention_mask.unsqueeze(1).unsqueeze(1) < 0.5 + elif attention_mask.ndim == 4: + assert attention_mask.shape[1] == 1 and attention_mask.shape[2] == 1, "Attention mask shape incorrect" + extended_mask = attention_mask + # Squeeze attention mask to [b, sq] for averaging pooling later + + attention_mask = extended_mask.squeeze() < 0.5 + else: + raise ValueError("Attention_mask shape incorrect") + + output = self.forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=extended_mask, + decoder_input=decoder_input, + ) + embeddings = _average_pool(output, attention_mask) + embeddings = F.normalize(embeddings, p=2, dim=1) + return embeddings + + @property + def training_loss_reduction(self) -> BERTInBatchExclusiveHardNegativesRankingLoss: # pylint: disable=C0115,C0116 + if not self._training_loss_reduction: + if self.config.in_batch_negatives: + loss_func = BERTInBatchExclusiveHardNegativesRankingLoss + else: + loss_func = HardNegativeRankingLoss + self._training_loss_reduction = loss_func( + validation_step=False, + num_hard_negatives=self.config.num_hard_negatives, + scale=self.config.ce_loss_scale, + label_smoothing=self.config.label_smoothing, + ) + + return self._training_loss_reduction + + @property + def validation_loss_reduction(self) -> BERTInBatchExclusiveHardNegativesRankingLoss: # pylint: disable=C0115,C0116 + if not self._validation_loss_reduction: + if self.config.in_batch_negatives: + loss_func = BERTInBatchExclusiveHardNegativesRankingLoss + else: + loss_func = HardNegativeRankingLoss + self._validation_loss_reduction = loss_func( + validation_step=True, + num_hard_negatives=self.config.num_hard_negatives, + scale=self.config.ce_loss_scale, + label_smoothing=self.config.label_smoothing, + ) + + return self._validation_loss_reduction + + +@io.model_importer(LlamaEmbeddingModel, "hf") +class LlamaEmbeddingImporter(HFLlamaImporter): + """HF 
Importer for Llama Embedding Model""" + + def init(self) -> LlamaEmbeddingModel: + return LlamaEmbeddingModel(self.config, tokenizer=self.tokenizer) + + @property + def config(self) -> Llama32Config1B: + # pylint : disable=C0116 + from transformers import LlamaConfig as HFLlamaConfig + + source = HFLlamaConfig.from_pretrained(str(self)) + + def make_vocab_size_divisible_by(vocab_size): + base = 128 + while vocab_size % base != 0: + base //= 2 + return base + + output = Llama32EmbeddingConfig1B( + num_layers=source.num_hidden_layers, + hidden_size=source.hidden_size, + ffn_hidden_size=source.intermediate_size, + num_attention_heads=source.num_attention_heads, + init_method_std=source.initializer_range, + layernorm_epsilon=source.rms_norm_eps, + num_query_groups=source.num_key_value_heads, + rotary_base=source.rope_theta, + gated_linear_unit=True, + make_vocab_size_divisible_by=make_vocab_size_divisible_by(source.vocab_size), + share_embeddings_and_output_weights=getattr(source, "tie_word_embeddings", False), + fp16=(dtype_from_hf(source) == torch.float16), + bf16=(dtype_from_hf(source) == torch.bfloat16), + params_dtype=dtype_from_hf(source), + ) + + return output + + +@io.model_exporter(LlamaEmbeddingModel, "hf") +class LlamaEmbeddingExporter(io.ModelConnector[LlamaEmbeddingModel, "LlamaBidirectionalModel"]): + """HF Exporter for NV Embedding Llama Model. + Note that NV Embedding LLama uses customized LlamaBidirectionalConfig config. + """ + + def init(self, dtype=torch.bfloat16) -> "LlamaForCausalLM": + from transformers.modeling_utils import no_init_weights + + from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalModel + + LlamaBidirectionalModel.register_for_auto_class("AutoModel") + with no_init_weights(True): + return LlamaBidirectionalModel._from_config(self.config, torch_dtype=dtype) + + def apply(self, output_path: Path) -> Path: + source, _ = self.nemo_load(str(self)) + source_dtype = source.module.embedding.word_embeddings.weight.dtype + target = self.init(source_dtype) + target = self.convert_state(source, target) + + target = target.cpu() + target.save_pretrained(output_path) + try: + tokenizer = self.tokenizer.tokenizer + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = source.config.truncation_method + + tokenizer.save_pretrained(output_path) + except Exception: + logging.warning("Failed to save tokenizer") + + return output_path + + @property + def config(self): + """Get HF NV Embedding Llama Config.""" + source: LlamaConfig = io.load_context(str(self), subpath="model.config") + + from nemo.collections.llm.gpt.model.hf_llama_embedding import LlamaBidirectionalConfig + + LlamaBidirectionalConfig.register_for_auto_class("AutoConfig") + return LlamaBidirectionalConfig( + num_hidden_layers=source.num_layers, + hidden_size=source.hidden_size, + intermediate_size=source.ffn_hidden_size, + num_attention_heads=source.num_attention_heads, + max_position_embeddings=source.seq_length, + initializer_range=source.init_method_std, + rms_norm_eps=source.layernorm_epsilon, + num_key_value_heads=source.num_query_groups, + rope_theta=source.rotary_base, + vocab_size=self.tokenizer.vocab_size, + tie_word_embeddings=source.share_embeddings_and_output_weights, + ) + + def convert_state(self, source, target): + """Convert NeMo State dict to HF.""" + mapping = { + "decoder.layers.*.self_attention.linear_proj.weight": "layers.*.self_attn.o_proj.weight", + "decoder.layers.*.mlp.linear_fc2.weight": 
"layers.*.mlp.down_proj.weight", + "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "layers.*.input_layernorm.weight", + "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "layers.*.post_attention_layernorm.weight", + "decoder.final_layernorm.weight": "norm.weight", + } + transforms = [_export_qkv, _export_linear_fc1, _export_embedding] + + return io.apply_transforms( + source, + target, + mapping=mapping, + transforms=transforms, + ) + + @property + def tokenizer(self) -> "TokenizerSpec": + """Get NeMo Tokenizer""" + return io.load_context(str(self), subpath="model").tokenizer + + +@io.state_transform( + source_key="decoder.layers.*.self_attention.linear_qkv.weight", + target_key=( + "layers.*.self_attn.q_proj.weight", + "layers.*.self_attn.k_proj.weight", + "layers.*.self_attn.v_proj.weight", + ), +) +def _export_qkv(ctx: io.TransformCTX, linear_qkv): + megatron_config = ctx.source.config + + head_num = megatron_config.num_attention_heads + num_query_groups = megatron_config.num_query_groups + heads_per_group = head_num // num_query_groups + hidden_size = megatron_config.hidden_size + head_size = megatron_config.kv_channels + qkv_total_dim = head_num + 2 * num_query_groups + + linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, hidden_size]) + q_slice = torch.cat( + [ + torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + + q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu() + k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu() + v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu() + + return q_proj, k_proj, v_proj + + +@io.state_transform( + source_key="embedding.word_embeddings.weight", + target_key="embed_tokens.weight", +) +def _export_embedding(ctx: io.TransformCTX, embedding): + megatron_config = ctx.target.config + # prune padding. + return embedding[: megatron_config.vocab_size, :] + + +@io.state_transform( + source_key="decoder.layers.*.mlp.linear_fc1.weight", + target_key=("layers.*.mlp.gate_proj.weight", "layers.*.mlp.up_proj.weight"), +) +def _export_linear_fc1(linear_fc1): + gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) + + return gate_proj, up_proj diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index 0a87480f31d9..42e70d8c31d6 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -149,6 +149,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. 
trainer.strategy._setup_optimizers = False trainer.ckpt_path = None trainer.strategy.connect(model) + model.trainer = trainer if trainer.strategy.launcher is not None: trainer.strategy.launcher.launch(lambda: None, trainer=trainer) trainer.strategy.setup_environment() diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 09291e4165be..0892bb10f16b 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -38,6 +38,7 @@ llama31_405b, llama32_1b, llama32_3b, + llama_embedding_1b, mamba2_1_3b, mamba2_2_7b, mamba2_8b, diff --git a/nemo/collections/llm/recipes/llama_embedding_1b.py b/nemo/collections/llm/recipes/llama_embedding_1b.py new file mode 100644 index 000000000000..4a26fcc563d3 --- /dev/null +++ b/nemo/collections/llm/recipes/llama_embedding_1b.py @@ -0,0 +1,286 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import lightning.pytorch as pl +import nemo_run as run +import torch +from lightning.pytorch.callbacks.callback import Callback +from megatron.core.distributed import DistributedDataParallelConfig + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm import Llama32EmbeddingConfig1B, LlamaEmbeddingModel +from nemo.collections.llm.api import finetune +from nemo.collections.llm.peft import PEFT_STR2CLS +from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback +from nemo.utils.exp_manager import TimingCallback + +NAME = "nvembed_llama_1b" + + +@run.cli.factory(name=NAME) +def model() -> run.Config[pl.LightningModule]: + """ + Factory function to create a NVEmbed Llama3.2 1B model configuration. + + Returns: + run.Config[pl.LightningModule]: Configuration for the NVEmbed Llama3.2 1B model. + + Examples: + CLI usage: + $ nemo llm pretrain model=nvembed_llama_1b ... + + Python API usage: + >>> model_config = model() + >>> print(model_config) + """ + return run.Config(LlamaEmbeddingModel, config=run.Config(Llama32EmbeddingConfig1B)) + + +def trainer( + tensor_parallelism: int = 1, + pipeline_parallelism: int = 1, + pipeline_parallelism_type: Optional[torch.dtype] = None, + virtual_pipeline_parallelism: Optional[int] = None, + context_parallelism: int = 2, + sequence_parallelism: bool = False, + num_nodes: int = 1, + num_gpus_per_node: int = 8, + max_steps: int = 1168251, + callbacks: Optional[list[run.Config[Callback]]] = None, +) -> run.Config[nl.Trainer]: + """ + Configure the NeMo Lightning Trainer for NVEmbed Llama3.2 1B model. + + This function sets up the distributed training strategy and other training parameters. 
+ + Args: + tensor_parallelism (int): Degree of tensor model parallelism. + pipeline_parallelism (int): Degree of pipeline model parallelism. + pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. + virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. + context_parallelism (int): Degree of context parallelism. + sequence_parallelism (bool): Whether to use sequence parallelism. + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + max_steps (int): Maximum number of training steps. + callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. + + Returns: + run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. + + Examples: + CLI usage: + $ nemo llm pretrain trainer=nvembed_llama_1b ... + + Python API usage: + >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) + >>> print(trainer_config) + + Note: + For more information on distributed training strategies, refer to the + NeMo documentation on multi-GPU and multi-node training. + """ + strategy = run.Config( + nl.MegatronStrategy, + tensor_model_parallel_size=tensor_parallelism, + pipeline_model_parallel_size=pipeline_parallelism, + pipeline_dtype=pipeline_parallelism_type, + virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, + context_parallel_size=context_parallelism, + sequence_parallel=sequence_parallelism, + gradient_as_bucket_view=True, + ckpt_async_save=True, + ckpt_parallel_load=True, + ddp=run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ), + ) + + trainer = run.Config( + nl.Trainer, + accelerator="gpu", + accumulate_grad_batches=1, + callbacks=callbacks, + devices=num_gpus_per_node, + limit_test_batches=50, + limit_val_batches=32, + log_every_n_steps=10, + max_steps=max_steps, + num_nodes=num_nodes, + plugins=bf16_mixed(), + strategy=strategy, + use_distributed_sampler=False, + val_check_interval=2000, + ) + + return trainer + + +@run.cli.factory(target=finetune, name=NAME) +def finetune_recipe( + dir: Optional[str] = None, + name: str = "default", + resume_path: str = "meta-llama/Llama-3.2-1B", + num_nodes: int = 1, + num_gpus_per_node: int = 8, + micro_batch_size: int = 4, + global_batch_size: int = 64, + peft_scheme: Optional[str] = 'lora', + seq_length: Optional[int] = None, + packed_sequence: Optional[bool] = None, +) -> run.Partial: + """ + Create a fine-tuning recipe for NVEmbed Llama3.2 1B model. + + This function sets up a complete configuration for fine-tuning, including + model, trainer, data, logging, optimization, and resumption settings. + + Args: + dir (Optional[str]): Directory for saving logs and checkpoints. + name (str): Name of the fine-tuning run. + resume_path (str): Path to the Huggingface model or pretrained distributed checkpoint for resume + num_nodes (int): Number of compute nodes to use. + num_gpus_per_node (int): Number of GPUs per node. + micro_batch_size (int): Size of micro batch. + global_batch_size (int): Size of global batch. + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + seq_length (int): Maximum number of tokens per microbatch. + packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given + maximum seq_length for better efficiency. 
pack sequence is not supported for embedding model training. + + Returns: + run.Partial: Partial configuration for fine-tuning. + + Examples: + CLI usage: + $ nemo llm finetune --factory nvembed_llama_1b + + Python API usage: + >>> recipe = finetune_recipe(name="nvembed_llama_1b_finetune", num_nodes=2) + >>> print(recipe) + + Note: + This recipe uses the SPECTER dataset for fine-tuning. For more information + on fine-tuning LLMs with NeMo, see the fine-tuning guide in the + `examples/llm/finetune/` directory. + """ + if seq_length is None: + seq_length = 512 + + assert packed_sequence is None, 'pack_sequence is not supported for Embedding model finetuning.' + recipe = default_finetune_recipe(model(), resume_path, dir, name, num_nodes, num_gpus_per_node, packed_sequence) + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.strategy.tensor_model_parallel_size = 1 + recipe.optim.config.lr = 5e-6 + elif peft_scheme.lower() in ['lora', 'dora']: + recipe.peft = run.Config(PEFT_STR2CLS[peft_scheme.lower()]) + recipe.peft.dim = 8 + recipe.peft.alpha = 16 + recipe.optim.config.use_distributed_optimizer = False + + # some settings currently do not function correctly with LoRA + recipe.model.config.cross_entropy_loss_fusion = False + + recipe.optim.config.lr = 1e-4 + else: + raise ValueError(f"Unrecognized peft scheme: {peft_scheme}") + + # Sequence length settings in the model and dataset must agree + recipe.model.config.seq_length = seq_length + # Use Specter Dataset as the default for finetuning + recipe.data = run.Config( + llm.SpecterDataModule, + seq_length=seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + dataset_kwargs={ + 'num_hard_negatives': recipe.model.config.num_hard_negatives, + 'negative_sample_strategy': recipe.model.config.negative_sample_strategy, + 'add_bos': recipe.model.config.add_bos, + 'add_eos': recipe.model.config.add_eos, + }, + ) + + return recipe + + +def finetune_performance_optimizations( + recipe: run.Partial, + peft_scheme: str, +) -> run.Partial: + """ + Modify the given recipe to optimize settings for performance. + + This method enables performance optimizations that may not be suitable for all use cases. + Intended to build upon the standard fine-tuning recipe. + + Args: + recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added + peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. + Allowed values: 'lora'/'dora'/'none'/None. + + Returns: + run.Partial: Partial configuration for performance-optimized fine-tuning. + + Note: + Use this method with caution and only when you need maximum performance. + It may not be suitable for all hardware configurations or use cases. 
+ """ + recipe.trainer.strategy.tensor_model_parallel_size = 1 + + if not hasattr(recipe.trainer, "callbacks"): + recipe.trainer.callbacks = [] + + if peft_scheme is None or peft_scheme.lower() == 'none': + recipe.trainer.plugins.grad_reduce_in_fp32 = False + recipe.trainer.strategy.ddp = run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=False, + overlap_grad_reduce=True, + overlap_param_gather=True, + average_in_collective=True, + ) + recipe.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=False, + ) + ) + else: + recipe.peft.target_modules = ['linear_qkv'] + + recipe.trainer.callbacks.append(run.Config(TimingCallback)) + recipe.trainer.callbacks.append( + run.Config( + GarbageCollectionCallback, + 100, + 100, + ) + ) + + return recipe diff --git a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py index 8bca618dce3d..0da7af6ed96d 100644 --- a/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py +++ b/nemo/collections/nlp/data/information_retrieval/bert_embedding_dataset.py @@ -13,7 +13,7 @@ # limitations under the License. from random import choices, sample -from typing import Mapping, Optional +from typing import Literal, Mapping, Optional import datasets import numpy as np @@ -32,6 +32,10 @@ class BertEmbeddingDataset(Dataset): + """ + Embedding Dataset Class. + """ + def __init__( self, file_path: str, @@ -49,19 +53,28 @@ def __init__( special_tokens: Optional[Mapping[str, str]] = None, # special tokens, a dictory of {token_type: token} data_type: str = 'train', # train, query or doc num_hard_negatives: int = 4, + negative_sample_strategy: Literal["random", "first"] = 'first', ): """ file_path: Path to a JSONL dataset with (query,pos_doc,neg_doc) triplets in jsonl format. - tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec (ex: YTTM, SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + tokenizer: Tokenizer for the dataset. Instance of a class that inherits TokenizerSpec + (ex: YTTM, SentencePiece). + max_seq_length (int): maximum sequence length for each dataset examples. + Examples will either be truncated to fit this length or dropped if they cannot be truncated. + min_seq_length (int): min length of each data example in the dataset. + Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. This can be > dataset length if you want to oversample data. If None, all samples will be loaded. - index_mapping_dir: Directory to save the index mapping to. If None, will write to the same folder as the dataset. + max_num_samples: Maximum number of samples to load. This can be > dataset length + if you want to oversample data. If None, all samples will be loaded. + index_mapping_dir: Directory to save the index mapping to. + If None, will write to the same folder as the dataset. 
truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. Default: {'system_turn_start': '', 'turn_start': '', 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. + Default: {'system_turn_start': '', 'turn_start': '', + 'label_start': '', 'end_of_turn': '\n', "end_of_name": "\n"} + negative_sample_strategy: Strategy for negative samples. Options: ['random', 'first'] """ # TODO: lot of copy-paste from GPTSFDDataset, should refactor both to use a common base class (@adithyare) self.tokenizer = tokenizer @@ -75,6 +88,14 @@ def __init__( self.index_mapping_dir = index_mapping_dir self.virtual_tokens = virtual_tokens self.truncation_method = truncation_method + self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id else self.tokenizer.eos_id + self.negative_sample_strategy = negative_sample_strategy + assert ( + truncation_method == 'left' or truncation_method == 'right' + ), 'truncation_method must be either "left" or "right"' + assert ( + negative_sample_strategy == 'random' or negative_sample_strategy == 'first' + ), 'negative_sample_strategy must be either "random" or "first"' if special_tokens is None: self.special_tokens = { "system_turn_start": "", @@ -98,6 +119,13 @@ def __init__( # Will be None after this call if `max_num_samples` is None self.samples_mapping = None self._build_samples_mapping() + logging.info( + f"Creating EmbeddingDataset with seed={self.seed},\n" + f"add_bos={self.add_bos}, add_eos={self.add_eos},\n" + f"max_seq_length={self.max_seq_length}, min_seq_length={self.min_seq_length},\n" + f"pad_token_id={self.pad_token_id}, negative_sample_strategy={self.negative_sample_strategy},\n" + f"num_hard_negatives={self.num_hard_negatives}." + ) def _build_samples_mapping(self): if self.max_num_samples is not None: @@ -169,8 +197,13 @@ def _process_example(self, example): # sample rest with replacement nd = nd + choices(example['neg_doc'], k=self.num_hard_negatives - len(example['neg_doc'])) else: - # sample without replacement - nd = sample(example['neg_doc'], k=self.num_hard_negatives) + if self.negative_sample_strategy == 'random': + # sample without replacement + # randomly pick self.num_hard_negatives negatives + nd = sample(example['neg_doc'], k=self.num_hard_negatives) + else: + # Choose the first self.num_hard_negatives samples + nd = example['neg_doc'][: self.num_hard_negatives] assert len(nd) == self.num_hard_negatives, "Error in sampling required number of hard negatives" nd = [self.tokenizer.text_to_ids("passage: " + ex.strip()) for ex in nd] @@ -228,27 +261,17 @@ def _maybe_cast_to_list(self, x): def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m - def _collate_item(self, item, max_length, pad_id): + def _collate_item(self, item, max_length): item = self._maybe_cast_to_list(item) - # max_length = max([len(x) for x in item]) if item else 0 - # here [0] should be tokenizer.pad_id - item = [x + [pad_id] * (max_length - len(x)) for x in item] + pad_id = self.pad_token_id + if self.truncation_method == 'left': + item = [[pad_id] * (max_length - len(x)) + x for x in item] + else: + item = [x + [pad_id] * (max_length - len(x)) for x in item] return item @torch.no_grad() - def _create_attention_mask(self, max_length): - """Create `attention_mask`. - Args: - input_ids: A 1D tensor that holds the indices of tokens.
- """ - # seq_length = len(input_ids) - # `attention_mask` has the shape of [1, seq_length, seq_length] - attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0) - attention_mask = attention_mask < 0.5 - return attention_mask - - @torch.no_grad() - def _create_attention_mask2(self, max_length, item_lengh): + def _create_attention_mask2(self, max_length, item_length): """Create `attention_mask`. Args: input_ids: A 1D tensor that holds the indices of tokens. @@ -256,10 +279,20 @@ def _create_attention_mask2(self, max_length, item_lengh): # seq_length = len(input_ids) # `attention_mask` has the shape of [1, seq_length, seq_length] attention_mask = torch.zeros(max_length) - attention_mask[:item_lengh] = 1 + if self.truncation_method == 'left': + # input ids: [pad] [pad] token token | + # attention mask: 0 0 1 1 + attention_mask[max_length - item_length :] = 1 + else: + # input ids: token token [pad] [pad] | + # attention mask: 1 1 0 0 + attention_mask[:item_length] = 1 return attention_mask - def collate_fn(self, batch): + def _collate_fn(self, batch): + """ + Collate query passage together + """ input_ids = [] metadata = [] lengths = [] @@ -295,7 +328,7 @@ def collate_fn(self, batch): attention_mask = torch.stack(attention_mask) position_ids = [list(range(max_length)) for _ in batch] position_ids = torch.LongTensor(position_ids) - input_ids = torch.LongTensor(self._collate_item(input_ids, max_length=max_length, pad_id=0)) + input_ids = torch.LongTensor(self._collate_item(input_ids, max_length=max_length)) lengths = torch.LongTensor(lengths) - 1 # subtract 1 to account for the eos token processed_batch = { @@ -303,6 +336,7 @@ def collate_fn(self, batch): 'token_type_ids': torch.zeros_like(input_ids), 'attention_mask': attention_mask, 'metadata': metadata, + 'position_ids': position_ids, } return processed_batch diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py index 7c3f3c194f14..bc4d103285a7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_decoder_layer.py @@ -111,6 +111,7 @@ def forward( attention_bias=None, inference_params=None, packed_seq_params=None, + sequence_len_offset=None, ): # hidden_states: [s, b, h] diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index 1def214113ee..955bc659eb42 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -255,6 +255,7 @@ def forward( attention_bias=None, inference_params=None, packed_seq_params=None, # TODO: handle this + sequence_len_offset=None, # TODO: handle this ): # Use is_first_microbatch argument during CUDA graph capture. Use self.is_first_microbatch otherwise. 
hidden_states = super().forward( diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index bba990750adb..e426212bcca6 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -85,6 +85,7 @@ def forward( attention_bias: Tensor = None, inference_params: InferenceParams = None, packed_seq_params: PackedSeqParams = None, + sequence_len_offset=None, ): hidden_states = super().forward( hidden_states=hidden_states, @@ -234,6 +235,7 @@ def forward( rotary_pos_cos=None, rotary_pos_sin=None, attention_bias=None, + sequence_len_offset=None, ): # hidden_states: [sq, b, h] diff --git a/nemo/collections/vlm/mllama/model/vision.py b/nemo/collections/vlm/mllama/model/vision.py index bb58ad093cd6..6ecd51ecf29d 100644 --- a/nemo/collections/vlm/mllama/model/vision.py +++ b/nemo/collections/vlm/mllama/model/vision.py @@ -23,7 +23,6 @@ import torch.nn.functional as F from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add - from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.custom_layers.transformer_engine import ( @@ -494,6 +493,7 @@ def forward( attention_bias=None, inference_params=None, packed_seq_params=None, + sequence_len_offset=None, ): """Forward.""" # hidden_states: [s, b, h] diff --git a/nemo/lightning/pytorch/utils.py b/nemo/lightning/pytorch/utils.py index 8072f10d7a45..a1f774cd11bc 100644 --- a/nemo/lightning/pytorch/utils.py +++ b/nemo/lightning/pytorch/utils.py @@ -38,7 +38,7 @@ def dtype_from_str(dtype): assert isinstance(dtype, str) if dtype in ["float16", "fp16", "16", "16-mixed"]: return torch.float16 - elif dtype == ["bfloat16", "bf16-mixed"]: + elif dtype in ["bfloat16", "bf16-mixed"]: return torch.bfloat16 else: return torch.float32 @@ -62,11 +62,7 @@ def is_trainer_attached(model: pl.LightningModule): """ Returns true if trainer is attached to a model """ - try: - trainer = model.trainer - return True - except (AttributeError, RuntimeError): - return False + return hasattr(model, 'trainer') def get_huggingface_model_from_trainer(trainer: 'lightning.pytorch.Trainer') -> 'nn.Module': diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 7f98ae8f85d2..664d8cd1961f 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -24,10 +24,9 @@ from lightning.pytorch.loggers import WandbLogger from nemo_run.core.serialization.yaml import YamlSerializer -from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback +from nemo.lightning.pytorch.callbacks import MemoryProfileCallback, NsysCallback, PreemptionCallback from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy from nemo.utils import logging - from nemo.utils.import_utils import safe_import res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency') @@ -174,6 +173,34 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"] +@dataclass(kw_only=True) +class MemoryProfilePlugin(run.Plugin): + """ + A plugin for memory profiling. + + The MemoryProfilePlugin allows you to profile a timeline of memory allocations during you run. 
+ The memory profiling plugin creates snapshots during the entire training. You can specify which ranks to run the profiling. + + Args: + dir (str): Directory to store the memory profile dump .pickle files + ranks (Optional[list[int]]): The ranks on which to run the memory profiling. If not specified, + profiling will be run on rank 0. + """ + + dir: str + ranks: Optional[list[int]] = None + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Partial): + memprof_callback = run.Config( + MemoryProfileCallback, + dir=self.dir, + ranks=self.ranks or [0], + ) + callbacks: list[run.Config[Callback]] = [memprof_callback] # type: ignore + _merge_callbacks(task, callbacks=callbacks) + + @dataclass(kw_only=True) class WandbPlugin(run.Plugin): """ diff --git a/nemo/package_info.py b/nemo/package_info.py index 1d69deff96d3..01fac3e22139 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -16,7 +16,7 @@ MAJOR = 2 MINOR = 2 PATCH = 0 -PRE_RELEASE = 'rc0' +PRE_RELEASE = 'rc1' # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) diff --git a/reinstall.sh b/reinstall.sh index 030e6252a0c6..51b0749f6353 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -24,7 +24,7 @@ export NEMO_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 export APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c export CAUSAL_CONV_TAG=v1.2.2.post1 export MAMBA_TAG=v2.2.0 -export MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031 +export MCORE_TAG=0e85db539cf16816ffced6e7dac644d91ffadc04 export NV_RESILIENCY_EXT_TAG=97aad77609d2e25ed38ac5c99f0c13f93c48464e if [ -n "${NVIDIA_PYTORCH_VERSION}" ]; then diff --git a/requirements/requirements_deploy.txt b/requirements/requirements_deploy.txt index a65b651a76c8..e557bdc08a01 100644 --- a/requirements/requirements_deploy.txt +++ b/requirements/requirements_deploy.txt @@ -3,4 +3,4 @@ nvidia-pytriton pydantic-settings tensorstore uvicorn -zarr +zarr>=2.18.2,<3.0.0 diff --git a/requirements/requirements_infer.txt b/requirements/requirements_infer.txt index 47daf571d26f..0a38d92555db 100644 --- a/requirements/requirements_infer.txt +++ b/requirements/requirements_infer.txt @@ -5,4 +5,4 @@ nvidia-pytriton pydantic-settings tensorstore uvicorn -zarr +zarr>=2.18.2,<3.0.0 diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 6a0ae8adf66c..91005637c365 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -22,4 +22,4 @@ sacrebleu # manually install sacrebleu[ja] for Japanese support; MeCab is unsup sentence_transformers tensorstore tiktoken==0.7.0 -zarr +zarr>=2.18.2,<3.0.0 diff --git a/tests/collections/audio/test_audio_flowmatching.py b/tests/collections/audio/test_audio_flowmatching.py new file mode 100644 index 000000000000..baafc3d31076 --- /dev/null +++ b/tests/collections/audio/test_audio_flowmatching.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass + +import pytest +import torch + +from nemo.collections.audio.parts.submodules.flow import ConditionalFlowMatchingEulerSampler + +NUM_STEPS = [1, 5, 10, 20, 100] + + +@pytest.mark.parametrize("num_steps", NUM_STEPS) +def test_euler_sampler_nfe(num_steps): + """ + For this specific solver the number of steps should be equal to the number of function (estimator) evaluations + """ + + class IdentityEstimator(torch.nn.Module): + def forward(self, input, input_length, condition): + return input, input_length + + @dataclass + class ForwardCounterHook: + counter: int = 0 + + def __call__(self, *args, **kwargs): + self.counter += 1 + + estimator = IdentityEstimator() + counter_hook = ForwardCounterHook() + estimator.register_forward_hook(counter_hook) + + sampler = ConditionalFlowMatchingEulerSampler(estimator=estimator, num_steps=num_steps) + + b, c, d, l = 2, 3, 4, 5 + lengths = [5, 3] + init_state = torch.randn(b, c, d, l) + init_state_length = torch.LongTensor(lengths) + + sampler.forward(state=init_state, estimator_condition=None, state_length=init_state_length) + + assert counter_hook.counter == sampler.num_steps diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 2a7b1fdfdad6..ee506474ab00 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -337,7 +337,15 @@ def main(args): for key, (shape, dtype, device) in expected_ckpt.items(): assert key in ckpt, f"Expected {key} to be in ckpt" assert isinstance(ckpt[key], torch.Tensor), f"Expected {key} to be a tensor" - assert ckpt[key].shape == shape, f"Expected {key} shapes to match {ckpt[key].shape} & {shape}" + + if len(shape) == 1 and key.startswith('optimizer.state'): + assert ckpt[key].shape == ( + 1, + shape[0], + ), f"Expected {key} shapes to match {ckpt[key].shape} & (1, {shape[0]})" + else: + assert ckpt[key].shape == shape, f"Expected {key} shapes to match {ckpt[key].shape} & {shape}" + assert ckpt[key].dtype == dtype, f"Expected {key} dtype to match {ckpt[key].dtype} & {dtype}" assert str(ckpt[key].device) == device, f"Expected {key} device to match {ckpt[key].device} & {device}" diff --git a/tests/collections/speechlm/hf/sft.py b/tests/collections/speechlm/hf/sft.py new file mode 100755 index 000000000000..41f626f10852 --- /dev/null +++ b/tests/collections/speechlm/hf/sft.py @@ -0,0 +1,129 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import fiddle as fdl +import torch +from lhotse.dataset.collation import collate_matrices, collate_vectors +from omegaconf import OmegaConf + +from nemo import lightning as nl +from nemo.collections import speechlm +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer +from nemo.collections.speechlm.models import HFAutoModelForSpeechSeq2Seq + +torch.set_float32_matmul_precision("medium") + + +class LhotseHfNeMoDataset(torch.utils.data.Dataset): + def __init__(self, processor, tokenizer, decoder_mask_fill=-100): + super().__init__() + self.processor = processor + self.tokenizer = tokenizer + self.decoder_mask_fill = decoder_mask_fill + + def __getitem__(self, cuts): + features = [] + for cut in cuts: + audio = cut.load_audio() + features.append( + self.processor( + audio, + sampling_rate=cut.sampling_rate, + return_tensors="pt", + text=cut.supervisions[0].text, + ) + ) + + input_features = collate_matrices(tensors=[f["input_features"].squeeze(0) for f in features]) + labels = collate_vectors(tensors=[c.supervisions[0].tokens for c in cuts]) + decoder_input_ids = labels[:, :-1] + decoder_input_ids = decoder_input_ids.masked_fill( + decoder_input_ids == self.decoder_mask_fill, self.tokenizer.pad_id + ) + labels = labels[:, 1:].reshape(-1) + + return { + "input_features": input_features, + "labels": labels, + "decoder_input_ids": decoder_input_ids, + } + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + + # Models can be one of the supported ones by AutoModelForSpeechSeq2Seq such as + # openai/whisper-large-v3 and facebook/s2t-small-librispeech-asr + parser.add_argument('--model', default='openai/whisper-large-v3') + parser.add_argument('--strategy', type=str, default='auto', choices=['auto', 'ddp', 'fsdp']) + parser.add_argument('--devices', default=1) + parser.add_argument('--accelerator', default='gpu', choices=['gpu']) + parser.add_argument('--max-steps', type=int, default=100) + parser.add_argument('--model-save-path', type=str, default=None) + args = parser.parse_args() + + model = HFAutoModelForSpeechSeq2Seq(model_name=args.model) + model = model.to(torch.float) + processor = model.processor + tokenizer = AutoTokenizer(args.model, include_special_tokens=True) + + config = OmegaConf.create( + { + "cuts_path": "/home/TestData/speechlm/lhotse/libri/libri-train-5.jsonl.gz", + "sample_rate": 16000, + "shuffle": True, + "num_workers": 2, + "batch_size": 4, + "shuffle_buffer_size": 100, + } + ) + + train_dataloader = get_lhotse_dataloader_from_config( + config, + global_rank=0, + world_size=1, + dataset=LhotseHfNeMoDataset( + processor=processor, + tokenizer=tokenizer, + ), + tokenizer=tokenizer, + ) + + speechlm.api.finetune( + model=model, + data=train_dataloader, + trainer=nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator=args.accelerator, + strategy=args.strategy, + precision="bf16-mixed", + log_every_n_steps=1, + limit_val_batches=0.0, + num_sanity_val_steps=0, + accumulate_grad_batches=10, + gradient_clip_val=0.5, + use_distributed_sampler=False, + callbacks=[], + logger=None, + ), + optim=fdl.build(speechlm.adam.pytorch_adam_with_flat_lr(lr=1e-5)), + log=None, + ) + + if args.model_save_path is not None: + model.save_pretrained(args.model_save_path)
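As a quick orientation for reviewers, the sketch below (not part of the diff) shows how the new pieces above are meant to compose: the llama_embedding_1b recipe builds a LoRA fine-tuning setup for LlamaEmbeddingModel, and CustomRetrievalDataModule can replace the default SpecterDataModule when training from a local JSON file of (question, pos_doc, neg_doc) records. The data path and batch sizes are illustrative assumptions, not values taken from this change.

import nemo_run as run

from nemo.collections.llm.gpt.data import CustomRetrievalDataModule
from nemo.collections.llm.recipes import llama_embedding_1b

# Default LoRA fine-tuning recipe for the 1B embedding model (defined in llama_embedding_1b.py above).
recipe = llama_embedding_1b.finetune_recipe(
    name="nvembed_llama_1b_finetune",
    resume_path="meta-llama/Llama-3.2-1B",
    num_nodes=1,
    num_gpus_per_node=8,
    peft_scheme="lora",
)

# Swap the default SpecterDataModule for the new CustomRetrievalDataModule.
# `data_root` must point at an existing JSON file whose records contain
# "question", "pos_doc" and "neg_doc" fields; the path here is a placeholder.
recipe.data = run.Config(
    CustomRetrievalDataModule,
    data_root="/data/retrieval/train.json",
    seq_length=512,
    micro_batch_size=4,
    global_batch_size=64,
    dataset_kwargs={
        "num_hard_negatives": recipe.model.config.num_hard_negatives,
        "negative_sample_strategy": recipe.model.config.negative_sample_strategy,
        "add_bos": recipe.model.config.add_bos,
        "add_eos": recipe.model.config.add_eos,
    },
)

if __name__ == "__main__":
    # Assumes a local nemo_run environment; adjust the executor for your cluster.
    run.run(recipe)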