-
Notifications
You must be signed in to change notification settings - Fork 75
/
Dockerfile
135 lines (112 loc) · 4.76 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
FROM nvidia/cuda:11.0-devel-ubuntu18.04

# Version pins for the frameworks and NVIDIA libraries installed further down.
# TF 2.4 works with CUDA 11.0, not 11.1 - https://github.com/tensorflow/tensorflow/issues/45848
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully.
# cuDNN version listed here: https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#package-manager-ubuntu-install
ENV HOROVOD_VERSION=0.21.1 \
    TENSORFLOW_PIP=tensorflow \
    TENSORFLOW_VERSION=2.4.0 \
    TENSORFLOW_ADDONS_VERSION=0.12.0 \
    PYTORCH_VERSION=1.7.1 \
    TORCHVISION_VERSION=0.8.2 \
    CUDNN_VERSION=8.0.5.39-1+cuda11.0 \
    NCCL_VERSION=2.8.3-1+cuda11.0

# Python minor version to install; override with `--build-arg python=<version>`.
# It is copied into ENV so it is also available in the running container.
ARG python=3.7
ENV PYTHON_VERSION=${python}
# LD_LIBRARY_PATH is set incorrectly for legacy compatibility; see https://gitlab.com/nvidia/container-images/cuda/-/issues/47
# This override is also the solution to "Couldn't open CUDA library libcuda.so" at
# https://github.com/tensorflow/tensorflow/issues/4078
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs

# Set default shell to /bin/bash for all following RUN instructions.
#   -u           treat references to unset variables as errors
#   -o pipefail  make a piped RUN step fail when any command in the pipeline fails,
#                not only the last one
SHELL ["/bin/bash", "-o", "pipefail", "-cu"]
# Install system packages in a single layer: build tools, the cuDNN and NCCL
# packages pinned to CUDNN_VERSION / NCCL_VERSION, image libraries, the selected
# Python interpreter, and the RDMA/verbs userspace libraries.
# --allow-downgrades / --allow-change-held-packages let apt install the pinned
# versions even when a different version is already present or held.
# The apt package lists are not removed in this layer.
RUN apt-get update && \
    apt-get install -y \
        --allow-downgrades \
        --allow-change-held-packages \
        --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++-4.8 \
        git \
        ibverbs-providers \
        libcudnn8=${CUDNN_VERSION} \
        libibverbs1 \
        libjpeg-dev \
        libnccl-dev=${NCCL_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libpng-dev \
        librdmacm1 \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        python${PYTHON_VERSION}-distutils \
        vim \
        wget
# Install Python: make `python` resolve to the selected interpreter, then bootstrap pip.
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
# The unversioned get-pip.py only supports currently maintained Python releases
# and exits with an error on older interpreters (including the default 3.7).
# Try the script published for this Python version first and fall back to the
# generic one for versions that have no dedicated script.
# `-f` makes curl fail on HTTP errors instead of saving an error page as get-pip.py.
RUN { curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py || \
      curl -fsSL -o get-pip.py https://bootstrap.pypa.io/get-pip.py; } && \
    python get-pip.py && \
    rm get-pip.py
# Install TensorFlow, Keras and PyTorch (MXNet is not installed in this image).
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir future typing
RUN pip install --no-cache-dir \
        numpy \
        keras \
        h5py
# TENSORFLOW_PIP selects the pip distribution name; the version pins come from the ENV block.
RUN pip install --no-cache-dir ${TENSORFLOW_PIP}==${TENSORFLOW_VERSION}
RUN pip install --no-cache-dir torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION}
# Install Open MPI 4.0.0 from source.
# Download, build, install and clean-up happen in one RUN so the source tree
# under /tmp/openmpi never persists in an image layer.
RUN mkdir /tmp/openmpi && cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
    tar -xzf openmpi-4.0.0.tar.gz && \
    cd openmpi-4.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j "$(nproc)" all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi
# Install Horovod, no CUDA stubs needed because we set LD_LIBRARY_PATH.
# The variables below are passed only to this pip invocation: NCCL is selected
# for GPU allreduce and broadcast, and the TensorFlow and PyTorch integrations
# are requested explicitly.
RUN HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_GPU_BROADCAST=NCCL \
    HOROVOD_WITH_TENSORFLOW=1 \
    HOROVOD_WITH_PYTORCH=1 \
    pip install --no-cache-dir horovod==${HOROVOD_VERSION}
# Install OpenSSH for MPI to communicate between containers.
# `apt-get update` runs in the same layer as the install so the step never
# depends on a package index left behind by an older, cached layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation:
# drop any existing StrictHostKeyChecking line from the system-wide client
# config, append an explicit "no", then swap the rewritten file into place.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
# Download the Horovod examples into /examples.
# GitHub no longer serves repositories over Subversion, so the former
# `svn checkout .../trunk/examples` cannot work. A shallow git clone of the
# release tag is used instead, which also keeps the examples in step with the
# installed HOROVOD_VERSION rather than whatever is on the default branch.
# The clone is removed in the same layer; only the examples directory is kept.
RUN git clone --depth 1 --branch v${HOROVOD_VERSION} https://github.com/horovod/horovod.git /tmp/horovod && \
    mv /tmp/horovod/examples /examples && \
    rm -rf /tmp/horovod
WORKDIR "/examples"
###### Modifications to horovod Dockerfile below
# tensorflow_addons is tightly coupled to TF version. TF 2.1 = 0.9.1, TF 2.2 = 0.10.0;
# this image pairs TENSORFLOW_VERSION with TENSORFLOW_ADDONS_VERSION from the ENV block.
# pip itself is upgraded first, then the extra packages are installed without
# keeping pip's download cache in the layer.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir \
        scikit-learn==0.23.1 \
        wandb==0.9.1 \
        tensorboard_plugin_profile \
        tensorflow-addons==${TENSORFLOW_ADDONS_VERSION} \
        colorama==0.4.3 \
        pandas \
        apache_beam
# key=value form; the space-separated `ENV key value` syntax is deprecated.
# NOTE(review): presumably set because the filesystem mounted at /fsx does not
# support HDF5 file locking - confirm.
ENV HDF5_USE_FILE_LOCKING="FALSE"
# Default working directory and command for interactive use.
WORKDIR /fsx
CMD ["/bin/bash"]
###### Modifications specifically for SageMaker are below
# Install SSH on SageMaker machines.
# `apt-get update` runs in the same layer so the install does not rely on a
# package index left behind by an earlier, possibly stale, cached layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends openssh-client openssh-server
# Relax the pam_loginuid session rule for sshd from "required" to "optional".
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
# Create root's SSH directory and the sshd runtime directory, generate a
# passphrase-less RSA key pair, authorize that key for root logins, and disable
# host-key confirmation for root's outgoing connections.
# NOTE(review): the private key is generated at build time and therefore shared
# by every container started from this image - confirm this is acceptable for
# the environments where the image runs.
RUN mkdir -p /root/.ssh/ && \
    mkdir -p /var/run/sshd && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
# SageMaker training toolkit and MPI bindings for Python, pinned.
RUN pip install --no-cache-dir mpi4py==3.0.3 sagemaker-training==3.7.2
# Hugging Face stack, pinned to versions that are installed together.
RUN pip install --no-cache-dir \
        datasets==1.2.1 \
        sentencepiece==0.1.95 \
        tokenizers==0.9.4 \
        transformers==4.2.0
###### Modifications specifically for EC2 connected to FSx for Lustre are below
# When you use `docker run`, you'll need to run the following manually
# (this note originally announced two commands, but only this one is listed):
# pip install -e /fsx/transformers
# This is done in the MPIJob launch script when using Kubernetes, but not for a shell.