Sharded distributed sampler for cached dataloading in DDP #195

Open · wants to merge 20 commits into base: main
Changes from 19 commits
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -11,7 +11,7 @@ license = { file = "LICENSE" }
 authors = [{ name = "CZ Biohub SF", email = "[email protected]" }]
 dependencies = [
     "iohub==0.1.0",
-    "torch>=2.1.2",
+    "torch>=2.4.1",
     "timm>=0.9.5",
     "tensorboard>=2.13.0",
     "lightning>=2.3.0",
56 changes: 56 additions & 0 deletions viscy/data/distributed.py
@@ -0,0 +1,56 @@
"""Utilities for DDP training."""

from __future__ import annotations

import math
from typing import TYPE_CHECKING

import torch
import torch.distributed
from torch.utils.data.distributed import DistributedSampler

if TYPE_CHECKING:
from torch import Generator


class ShardedDistributedSampler(DistributedSampler):
    """Distributed sampler for cached dataloading in DDP.

    When shuffling, each rank draws from a contiguous shard of the dataset,
    permuting indices only within its shard so cached samples stay local to the rank."""

    def _sharded_randperm(self, max_size: int, generator: Generator) -> list[int]:
        """Generate a sharded random permutation of indices.
        Overlap may occur between the last two shards to maintain divisibility."""
        sharded_randperm = [
            torch.randperm(self.num_samples, generator=generator)
            + min(i * self.num_samples, max_size - self.num_samples)
            for i in range(self.num_replicas)
        ]
        indices = torch.stack(sharded_randperm, dim=1).reshape(-1)
        return indices.tolist()
    def __iter__(self):
        """Modified __iter__ method to shard data across distributed ranks."""
        max_size = len(self.dataset)  # type: ignore[arg-type]
        if self.shuffle:
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self.seed + self.epoch)
            indices = self._sharded_randperm(max_size, g)
        else:
            indices = list(range(max_size))
        if not self.drop_last:
            # add extra samples to make it evenly divisible
            padding_size = self.total_size - len(indices)
            if padding_size <= len(indices):
                indices += indices[:padding_size]
            else:
                indices += (indices * math.ceil(padding_size / len(indices)))[
                    :padding_size
                ]
        else:
            # remove tail of data to make it evenly divisible
            indices = indices[: self.total_size]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)
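
A minimal usage sketch (not part of this PR) of how the sampler could stand in for the stock `DistributedSampler` in a `DataLoader`. The toy `TensorDataset`, the explicit `rank`/`num_replicas` values, and the batch size are assumptions for illustration; in real DDP training Lightning or `torch.distributed` would supply the rank and world size:

```python
# Hypothetical usage sketch for ShardedDistributedSampler.
import torch
from torch.utils.data import DataLoader, TensorDataset

from viscy.data.distributed import ShardedDistributedSampler

# Toy dataset standing in for the cached dataset (assumption).
dataset = TensorDataset(torch.arange(10).float())

# Rank and replica count are fixed here for illustration only;
# DistributedSampler would normally read them from the process group.
sampler = ShardedDistributedSampler(
    dataset, num_replicas=2, rank=0, shuffle=True, seed=42
)
loader = DataLoader(dataset, batch_size=2, sampler=sampler)

for epoch in range(2):
    # Reshuffle deterministically each epoch, as with the stock sampler.
    sampler.set_epoch(epoch)
    for batch in loader:
        ...
```

Because `_sharded_randperm` interleaves the per-replica permutations before the strided subsample in `__iter__`, each rank iterates a shuffled view of one contiguous block of the dataset rather than a stride across a global permutation, which is what keeps per-rank caching effective.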