Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1 #22219

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from

Conversation

pavelToman
Copy link
Collaborator

@pavelToman pavelToman commented Jan 27, 2025

…M-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb, Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
Copy link

Updated software DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb

Diff against DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb

easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb

diff --git a/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb
index 11ef5be1e5..06084224a3 100644
--- a/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb
+++ b/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb
@@ -2,8 +2,10 @@ easyblock = 'PythonBundle'
 
 name = 'DeePDM-kit'
 version = '3.0.1'
-versionsuffix = '-CUDA-%(cudaver)s'
+local_cuda_suffix = '-CUDA-%(cudaver)s'
 local_tf_version = '2.15.1'
+local_lammps_version = '2Aug2023_update2'
+versionsuffix = local_cuda_suffix + '-with-LAMMPS-plugin'
 
 homepage = 'https://github.com/deepmodeling/deepmd-kit/'
 description = "A deep learning package for many-body potential energy representation and molecular dynamics."
@@ -21,9 +23,9 @@ dependencies = [
     ('CUDA', '12.1.1', '', SYSTEM),
     ('Python', '3.11.3'),
     ('SciPy-bundle', '2023.07'),
-    ('TensorFlow', local_tf_version, versionsuffix),
-    ('jax', '0.4.25', versionsuffix),
-    ('Horovod', '0.28.1', '%s-TensorFlow-%s' % (versionsuffix, local_tf_version)),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
+    ('jax', '0.4.25', local_cuda_suffix),
+    ('Horovod', '0.28.1', '%s-TensorFlow-%s' % (local_cuda_suffix, local_tf_version)),
     ('ml_dtypes', '0.3.2'),
     ('PyYAML', '6.0'),
     ('h5py', '3.9.0'),
@@ -32,12 +34,20 @@ dependencies = [
     ('SQLAlchemy', '2.0.25'),
     ('ruamel.yaml', '0.17.32'),
     ('typing-extensions', '4.9.0'),
+    ('LAMMPS', local_lammps_version, '-kokkos%s' % local_cuda_suffix),
 ]
 
-local_deepdm_configopts = '-DENABLE_TENSORFLOW=TRUE -DUSE_TF_PYTHON_LIBS=TRUE '
-local_deepdm_configopts += '-DUSE_CUDA_TOOLKIT=ON '
+local_deepdm_configopts = '-DENABLE_TENSORFLOW=TRUE -DUSE_TF_PYTHON_LIBS=TRUE -DUSE_CUDA_TOOLKIT=ON '
+local_deepdm_configopts += '-DLAMMPS_SOURCE_ROOT=%%(builddir)s/lammps-stable_%s ' % local_lammps_version
 
 components = [
+    ('LAMMPS', local_lammps_version, {
+        'easyblock': 'Tarball',
+        'source_urls': ['https://github.com/lammps/lammps/archive/'],
+        'sources': ['stable_%s.tar.gz' % local_lammps_version],
+        'skipsteps': ['install'],
+        'checksums': ['3bcecabc9cad08d0a4e4d989b52d29c58505f7ead8ebacf43c9db8d9fd3d564a'],
+    }),
     ('deepmd', version, {
         'easyblock': 'PythonPackage',
         'source_urls': ['https://pypi.python.org/packages/source/d/deepmd-kit/'],

Updated software DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb

Diff against DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb

easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb

diff --git a/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb b/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb
index 06084224a3..11ef5be1e5 100644
--- a/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1-with-LAMMPS-plugin.eb
+++ b/easybuild/easyconfigs/d/DeePDM-kit/DeePDM-kit-3.0.1-foss-2023a-CUDA-12.1.1.eb
@@ -2,10 +2,8 @@ easyblock = 'PythonBundle'
 
 name = 'DeePDM-kit'
 version = '3.0.1'
-local_cuda_suffix = '-CUDA-%(cudaver)s'
+versionsuffix = '-CUDA-%(cudaver)s'
 local_tf_version = '2.15.1'
-local_lammps_version = '2Aug2023_update2'
-versionsuffix = local_cuda_suffix + '-with-LAMMPS-plugin'
 
 homepage = 'https://github.com/deepmodeling/deepmd-kit/'
 description = "A deep learning package for many-body potential energy representation and molecular dynamics."
@@ -23,9 +21,9 @@ dependencies = [
     ('CUDA', '12.1.1', '', SYSTEM),
     ('Python', '3.11.3'),
     ('SciPy-bundle', '2023.07'),
-    ('TensorFlow', local_tf_version, local_cuda_suffix),
-    ('jax', '0.4.25', local_cuda_suffix),
-    ('Horovod', '0.28.1', '%s-TensorFlow-%s' % (local_cuda_suffix, local_tf_version)),
+    ('TensorFlow', local_tf_version, versionsuffix),
+    ('jax', '0.4.25', versionsuffix),
+    ('Horovod', '0.28.1', '%s-TensorFlow-%s' % (versionsuffix, local_tf_version)),
     ('ml_dtypes', '0.3.2'),
     ('PyYAML', '6.0'),
     ('h5py', '3.9.0'),
@@ -34,20 +32,12 @@ dependencies = [
     ('SQLAlchemy', '2.0.25'),
     ('ruamel.yaml', '0.17.32'),
     ('typing-extensions', '4.9.0'),
-    ('LAMMPS', local_lammps_version, '-kokkos%s' % local_cuda_suffix),
 ]
 
-local_deepdm_configopts = '-DENABLE_TENSORFLOW=TRUE -DUSE_TF_PYTHON_LIBS=TRUE -DUSE_CUDA_TOOLKIT=ON '
-local_deepdm_configopts += '-DLAMMPS_SOURCE_ROOT=%%(builddir)s/lammps-stable_%s ' % local_lammps_version
+local_deepdm_configopts = '-DENABLE_TENSORFLOW=TRUE -DUSE_TF_PYTHON_LIBS=TRUE '
+local_deepdm_configopts += '-DUSE_CUDA_TOOLKIT=ON '
 
 components = [
-    ('LAMMPS', local_lammps_version, {
-        'easyblock': 'Tarball',
-        'source_urls': ['https://github.com/lammps/lammps/archive/'],
-        'sources': ['stable_%s.tar.gz' % local_lammps_version],
-        'skipsteps': ['install'],
-        'checksums': ['3bcecabc9cad08d0a4e4d989b52d29c58505f7ead8ebacf43c9db8d9fd3d564a'],
-    }),
     ('deepmd', version, {
         'easyblock': 'PythonPackage',
         'source_urls': ['https://pypi.python.org/packages/source/d/deepmd-kit/'],

Updated software Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb

Diff against Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 089c87eaf0..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-PyTorch-1.12.0.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,36 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.12.0'
-versionsuffix = '-PyTorch-%s' % local_pt_version
+local_tf_version = '2.15.1'
+local_cuda_suffix = '-CUDA-%(cudaver)s'
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
-description = """Horovod is a distributed training framework for TensorFlow, PyTorch and MXnet.
-This build only has PyTorch enabled."""
+description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.23.1'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.10.4'),
+    ('Python', '3.11.3'),
     ('PyYAML', '6.0'),
-    ('PyTorch', local_pt_version),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
 use_pip = True
 sanity_pip_check = True
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 
Diff against Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 4473bc2e53..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2022a-CUDA-11.7.0-PyTorch-1.13.1.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,39 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.13.1'
+local_tf_version = '2.15.1'
 local_cuda_suffix = '-CUDA-%(cudaver)s'
-versionsuffix = local_cuda_suffix + '-PyTorch-%s' % local_pt_version
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
-description = """Horovod is a distributed training framework for TensorFlow, PyTorch and MXnet.
-This build only has PyTorch enabled."""
+description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.23.1'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.10.4'),
+    ('Python', '3.11.3'),
     ('PyYAML', '6.0'),
-    ('CUDA', '11.7.0', '', SYSTEM),
-    ('NCCL', '2.12.12', local_cuda_suffix),
-    ('PyTorch', local_pt_version, local_cuda_suffix),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
 use_pip = True
 sanity_pip_check = True
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 
Diff against Horovod-0.28.1-fosscuda-2020b-PyTorch-1.9.0.eb

easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-fosscuda-2020b-PyTorch-1.9.0.eb

diff --git a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-fosscuda-2020b-PyTorch-1.9.0.eb b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
index 3d2eb91b26..82290c1cbf 100644
--- a/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-fosscuda-2020b-PyTorch-1.9.0.eb
+++ b/easybuild/easyconfigs/h/Horovod/Horovod-0.28.1-foss-2023a-CUDA-12.1.1-TensorFlow-2.15.1.eb
@@ -2,36 +2,44 @@ easyblock = 'PythonBundle'
 
 name = 'Horovod'
 version = '0.28.1'
-local_pt_version = '1.9.0'
-versionsuffix = '-PyTorch-%s' % local_pt_version
+local_tf_version = '2.15.1'
+local_cuda_suffix = '-CUDA-%(cudaver)s'
+versionsuffix = local_cuda_suffix + '-TensorFlow-%s' % local_tf_version
 
 homepage = 'https://github.com/uber/horovod'
 description = "Horovod is a distributed training framework for TensorFlow."
 
-toolchain = {'name': 'fosscuda', 'version': '2020b'}
+toolchain = {'name': 'foss', 'version': '2023a'}
 
 builddependencies = [
-    ('CMake', '3.18.4'),
+    ('CMake', '3.26.3'),
 ]
 dependencies = [
-    ('Python', '3.8.6'),
-    ('PyYAML', '5.3.1'),
-    ('NCCL', '2.8.3', '-CUDA-%(cudaver)s'),
-    ('PyTorch', local_pt_version),
+    ('Python', '3.11.3'),
+    ('PyYAML', '6.0'),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', local_cuda_suffix),
+    ('TensorFlow', local_tf_version, local_cuda_suffix),
 ]
 
 use_pip = True
 sanity_pip_check = True
 
-preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
-preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
+local_preinstallopts = 'module swap protobuf/3.21.9-GCCcore-12.3.0 && '
+local_preinstallopts += 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
+local_preinstallopts += 'HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
 
 exts_list = [
     ('cloudpickle', '2.2.1', {
         'checksums': ['d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5'],
     }),
     ('horovod', version, {
-        'checksums': ['92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0'],
+        'patches': ['Horovod-0.28.1_support_flatbuffers_2.0.6.patch'],
+        'preinstallopts': local_preinstallopts,
+        'checksums': [
+            '92a43f5a94c43907a56805bad15f19700c62ffc83b7ca483f9e104e229f67ef0',
+            '9696ffb3b2bad1d6dd5a9f37bc58078ca7c585f933bcbec037036ad9fc0b297d',
+        ],
     }),
 ]
 
@@ -40,4 +48,6 @@ sanity_check_paths = {
     'dirs': ['lib/python%(pyshortver)s/site-packages'],
 }
 
+sanity_check_commands = ["horovodrun --help"]
+
 moduleclass = 'tools'

@pavelToman pavelToman changed the title from "{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1, CUDA 12.1.1 TensorFlow 2.15.1, ..." to "{ai}[foss/2023a] DeePDM-kit v3.0.1, Horovod v0.28.1 w/ CUDA 12.1.1 TensorFlow 2.15.1" on Jan 27, 2025
@pavelToman pavelToman added the `new` label and removed the `update` label on Jan 27, 2025
@pavelToman
Copy link
Collaborator Author

Test report by @pavelToman
SUCCESS
Build succeeded for 3 out of 3 (3 easyconfigs in total)
node4012.donphan.os - Linux RHEL 8.8, x86_64, Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz, 1 x NVIDIA A2, 545.23.08, Python 3.6.8
See https://gist.github.com/pavelToman/17efd63f20d8113a8751102975af2595 for a full test report.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

DeePMD-kit
1 participant