-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy path0.tensorflow.Dockerfile
88 lines (76 loc) · 2.96 KB
/
0.tensorflow.Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Base image: TensorFlow 2 container from NVIDIA's nvcr.io registry, tag 23.10.
FROM nvcr.io/nvidia/tensorflow:23.10-tf2-py3

# Build-time version pins and paths; each can be overridden with --build-arg.
# NOTE(review): NCCL_TESTS_VERSION and OPEN_MPI_PATH are declared but not
# referenced by any instruction visible in this file — confirm they are needed.
ARG AWS_OFI_NCCL_VERSION=v1.9.2-aws
ARG EFA_INSTALLER_VERSION=1.33.0
ARG NCCL_TESTS_VERSION=v2.13.9
ARG NCCL_VERSION=v2.21.3-1
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
# Refresh the apt package index and, in the same layer, remove the verbs /
# InfiniBand and NCCL packages that ship with the base image. Later steps in
# this file install the EFA stack and build NCCL from source instead.
# Keeping `apt-get update` in the same RUN as the command that uses the index
# avoids a separately cached (and potentially stale) index-only layer.
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibmad5 \
        libibnetdisc5 \
        libibumad3 \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 \
        libnccl-dev \
        libnccl2
# Delete the HPC-X, MPI and UCX trees, drop the HPC-X linker configuration
# file, and rebuild the dynamic linker cache so it no longer lists them.
RUN rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig

# Set OPAL_PREFIX to the empty string for later build steps and at runtime.
# NOTE(review): presumably the base image points this at the MPI tree removed
# above — confirm.
ENV OPAL_PREFIX=""
# Install the compilers, autotools, and utilities used by the source builds
# later in this file.
# `apt-get update` runs in the same RUN as `apt-get install` so this package
# list is never resolved against a stale index cached in an earlier layer.
# DEBIAN_FRONTEND is set inline so it does not leak into the runtime env.
# The package index is intentionally left in place here: a later step runs the
# EFA installer, which may need it to resolve dependencies.
# NOTE(review): --allow-unauthenticated disables apt signature verification;
# kept because it is not visible from this file whether a base-image
# repository requires it — drop it if possible.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        apt-utils \
        aptitude \
        autoconf \
        automake \
        build-essential \
        cmake \
        curl \
        gcc \
        gdb \
        git \
        kmod \
        libhwloc-dev \
        libtool \
        openssh-client \
        openssh-server \
        vim \
    && DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
# Prepend the CUDA CUPTI, Open MPI, NCCL, EFA and aws-ofi-nccl locations to
# the library and executable search paths inherited from the base image.
# NOTE(review): /opt/nccl/build/lib does not match the NCCL build in this
# file, which uses BUILDDIR=/usr/local; libnccl is instead located through the
# /usr/local/lib entry registered with ldconfig. Confirm whether this entry is
# still wanted.
ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu/:/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:${LD_LIBRARY_PATH}" \
    PATH="/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:${PATH}"
# Install the AWS CLI and the pynvml Python package.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): versions are unpinned, so rebuilds may pick up newer releases.
RUN pip install --no-cache-dir awscli pynvml
#################################################
## Install EFA installer
# Download the EFA installer release selected by EFA_INSTALLER_VERSION, run
# it, and remove both the unpacked tree and the downloaded tarball in the same
# layer so neither persists in the image.
# curl -f fails the build on an HTTP error instead of saving the error page
# as the archive.
# The package index is refreshed here so this layer does not depend on an
# index cached by an earlier layer.
# NOTE(review): assumes the installer resolves package dependencies through
# apt — confirm.
RUN cd "$HOME" \
    && apt-get update -y \
    && curl -fSL -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf "$HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && cd "$HOME" \
    && rm -rf "$HOME/aws-efa-installer" \
        "$HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz"
###################################################
## Install NCCL
# Build NCCL at the tag given by NCCL_VERSION, with build output rooted at
# /usr/local (BUILDDIR).
# NVCC_GENCODE limits code generation to sm_90 and sm_80, intended for p5 and
# p4 instances.
# - --depth 1 fetches only the requested tag rather than the full history.
# - make -j is bounded by the CPU count; a bare `-j` spawns unlimited jobs and
#   can exhaust memory on the build host.
# - NOTE(review): NCCL's makefiles are expected to place intermediate objects
#   under $BUILDDIR/obj; that directory is removed so it does not stay in the
#   image. The removal is a no-op if the directory does not exist.
RUN cd /tmp \
    && git clone --depth 1 -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git \
    && cd nccl \
    && make -j"$(nproc)" src.build BUILDDIR=/usr/local \
        NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80" \
    && cd /tmp \
    && rm -rf /tmp/nccl /usr/local/obj
# Register /usr/local/lib, the EFA library directory and the Open MPI library
# directory with the dynamic linker, then rebuild its cache.
RUN printf '%s\n' "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf \
    && printf '%s\n' "/opt/amazon/efa/lib" "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf \
    && ldconfig
###################################################
## Install AWS-OFI-NCCL plugin
# Clone aws-ofi-nccl into /opt/aws-ofi-nccl, check out AWS_OFI_NCCL_VERSION,
# and build/install it into /opt/aws-ofi-nccl/install against the EFA
# libfabric, CUDA, NCCL and Open MPI installations.
# - --with-nccl points at /usr/local, matching the BUILDDIR used by the NCCL
#   build in this file; the previous /usr/local/nccl path is not created by
#   any step here.
# - make -j is bounded by the CPU count instead of spawning unlimited jobs.
# - The source tree is kept because the install prefix lives inside it.
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-nccl=/usr/local \
        --with-mpi=/opt/amazon/openmpi/ \
    && make -j"$(nproc)" \
    && make install
# Remove the apt package index files from the final filesystem.
# NOTE(review): this runs in its own layer, so index files written by earlier
# layers still occupy space in the image; the final filesystem is tidied but
# the image is not smaller. To actually save space, the removal would have to
# happen in the same RUN that last used the index.
RUN rm -rf /var/lib/apt/lists/*
# Append two default MCA parameters to the Open MPI configuration file:
# no hwloc binding policy, and slot-based process mapping.
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
        >> /opt/amazon/openmpi/etc/openmpi-mca-params.conf
# Copy the `src` directory from the build context to /src in the image.
COPY src /src