Skip to content

Commit

Permalink
fix periodic integration test and add helper message on torchdata i…
Browse files Browse the repository at this point in the history
…mport failure

ghstack-source-id: 4db9ec111c83f7873253f19f0c95a997800e0f6b
Pull Request resolved: #353
  • Loading branch information
tianyu-l committed May 22, 2024
1 parent e7c31be commit 3fe5423
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 28 deletions.
51 changes: 24 additions & 27 deletions .github/workflows/integration_test_periodic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,27 @@ defaults:
shell: bash -l -eo pipefail {0}

jobs:
  # NOTE(review): this job appears to be the removed side of the diff (its
  # 27 lines match the "27 deletions" count on the page) — confirm against
  # the repository before relying on it.
  # Sets up a conda env on the GPU runner, installs nightly torch/torchdata
  # plus the requirements files, then runs test_runner.py and uploads coverage.
  unit_tests_4gpu:
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so YAML keeps the string "3.10" rather than the float 3.1.
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install --upgrade pip
      - name: Install dependencies
        run: |
          pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
          pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly
          python -m pip install -r requirements.txt
          python -m pip install -r dev-requirements.txt
      - name: Run test_runner.py
        run: python ./test_runner.py
      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3

  # NOTE(review): this job appears to be the added side of the diff — confirm
  # against the repository.
  # Delegates to the reusable linux_job workflow from pytorch/test-infra; the
  # inline script installs nightly torch/torchdata and runs test_runner.py,
  # passing it the freshly created artifacts-to-be-uploaded directory.
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.12xlarge.nvidia.gpu
      gpu-arch-type: cuda
      # Quoted so the version stays a string instead of being read as a float.
      gpu-arch-version: "12.1"
      # This image is faster to clone than the default, but it lacks CC needed by triton
      # (1m25s vs 2m37s).
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      script: |
        set -eux
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        pip config --user set global.progress_bar off
        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
        python -m pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/
        mkdir artifacts-to-be-uploaded
        python ./test_runner.py artifacts-to-be-uploaded
9 changes: 8 additions & 1 deletion torchtitan/datasets/hf_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,14 @@
import torch
from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset
from torchdata.stateful_dataloader import StatefulDataLoader

# Re-raise a failed torchdata import with installation instructions so the
# user sees how to fix it; the original ImportError is chained via `from e`.
try:
    from torchdata.stateful_dataloader import StatefulDataLoader
except ImportError as e:
    raise ImportError(
        # Adjacent literals are concatenated verbatim, so the trailing space
        # after "via:" is what separates the sentence from the pip command.
        "Please install the latest torchdata nightly to use StatefulDataLoader via: "
        "pip3 install --pre torchdata --index-url https://download.pytorch.org/whl/nightly"
    ) from e

from torchtitan.datasets.tokenizer import Tokenizer
from torchtitan.logging_utils import logger
Expand Down

0 comments on commit 3fe5423

Please sign in to comment.