Skip to content

Commit

Permalink
fix: fix seed with multiple ranks (#4479)
Browse files Browse the repository at this point in the history
Fix #4440.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Enhanced seed handling to support both single integers and lists for
    improved randomness in distributed training.
  - Added logging for neighbor statistics calculation during training.

- **Bug Fixes**
  - Improved error handling in data loading processes to ensure
    robustness.

- **Documentation**
  - Updated documentation for methods related to seed and batch size
    management for clarity.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
njzjz and pre-commit-ci[bot] authored Dec 25, 2024
1 parent 30b1447 commit f8605ee
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 8 deletions.
2 changes: 1 addition & 1 deletion deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def prepare_trainer_input_single(

# validation and training data
# avoid the same batch sequence among devices
rank_seed = (seed + rank) % (2**32) if seed is not None else None
rank_seed = [rank, seed % (2**32)] if seed is not None else None
validation_data_single = (
DpLoaderSet(
validation_systems,
Expand Down
11 changes: 10 additions & 1 deletion deepmd/pd/utils/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
from deepmd.pd.utils.dataset import (
DeepmdDataSetForLoader,
)
from deepmd.pt.utils.utils import (
mix_entropy,
)
from deepmd.utils import random as dp_random
from deepmd.utils.data import (
DataRequirementItem,
)
Expand All @@ -50,8 +54,13 @@


def setup_seed(seed):
paddle.seed(seed)
if isinstance(seed, (list, tuple)):
mixed_seed = mix_entropy(seed)
else:
mixed_seed = seed
paddle.seed(mixed_seed)
os.environ["FLAGS_cudnn_deterministic"] = "True"
dp_random.seed(seed)


class DpLoaderSet(Dataset):
Expand Down
2 changes: 1 addition & 1 deletion deepmd/pt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def prepare_trainer_input_single(

# validation and training data
# avoid the same batch sequence among devices
rank_seed = (seed + rank) % (2**32) if seed is not None else None
rank_seed = [rank, seed % (2**32)] if seed is not None else None
validation_data_single = (
DpLoaderSet(
validation_systems,
Expand Down
11 changes: 9 additions & 2 deletions deepmd/pt/utils/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
from deepmd.pt.utils.dataset import (
DeepmdDataSetForLoader,
)
from deepmd.pt.utils.utils import (
mix_entropy,
)
from deepmd.utils.data import (
DataRequirementItem,
)
Expand All @@ -50,8 +53,12 @@


def setup_seed(seed) -> None:
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if isinstance(seed, (list, tuple)):
mixed_seed = mix_entropy(seed)
else:
mixed_seed = seed
torch.manual_seed(mixed_seed)
torch.cuda.manual_seed_all(mixed_seed)
torch.backends.cudnn.deterministic = True
dp_random.seed(seed)

Expand Down
5 changes: 3 additions & 2 deletions deepmd/tf/entrypoints/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,10 @@ def _do_work(
seed = jdata["training"].get("seed", None)
if seed is not None:
# avoid the same batch sequence among workers
seed += run_opt.my_rank
seed = seed % (2**32)
dp_random.seed(seed)
dp_random.seed([run_opt.my_rank, seed])
else:
dp_random.seed(seed)

# setup data modifier
modifier = get_modifier(jdata["model"].get("modifier", None))
Expand Down
2 changes: 1 addition & 1 deletion deepmd/utils/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def random(size=None):
return _RANDOM_GENERATOR.random_sample(size)


def seed(val: Optional[int] = None) -> None:
def seed(val: Optional[Union[int, list[int]]] = None) -> None:
"""Seed the generator.
Parameters
Expand Down

0 comments on commit f8605ee

Please sign in to comment.