Skip to content

Commit

Permalink
Dask container recipe (#545)
Browse files Browse the repository at this point in the history
  • Loading branch information
cgoina authored Nov 16, 2023
1 parent 032ce1b commit d7c6b97
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 0 deletions.
56 changes: 56 additions & 0 deletions dask/2023.10.1-py11-ol9/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Dask container image: Mambaforge-based Python 3.11 environment with
# Dask 2023.10.1 on Oracle Linux 9.
# NOTE(review): DASK_VERSION is declared but never used below — the actual
# Dask version is pinned in conda-env.yml; keep the two in sync.
ARG DASK_VERSION=2023.10.1

FROM oraclelinux:9

# MAINTAINER is deprecated (hadolint DL4000); use a label instead.
LABEL maintainer="biocontainers <[email protected]>"

LABEL base_image="oraclelinux:9"
LABEL version="1"
LABEL software="Dask"
LABEL software.version="2023.10.1-py11-ol9"
LABEL about.summary="Dask is a flexible library for parallel computing in Python."
LABEL about.home="https://www.dask.org"
LABEL about.license="BSD-3-Clause"
LABEL about.license_file="https://github.com/dask/dask/blob/main/LICENSE.txt"
LABEL about.documentation="https://docs.dask.org/en/stable/"
LABEL extra.binaries="/opt/mambaforge/bin"
LABEL extra.scripts="/opt/scripts/daskscripts"
LABEL about.tags="implemented-in::python, interface::daemon, role::devel-lib"
# "minforge" typo fixed to "miniforge".
LABEL conda_forge.miniforge.version="23.3.1-1"
LABEL python.version="3.11"

ARG TARGETPLATFORM

# Expose all GPUs and driver capabilities when run under the NVIDIA runtime.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=all

# OS utilities needed by the dask launch scripts (ifconfig comes from
# net-tools — see determine_ip.sh). Clean dnf metadata in the SAME layer
# so it does not bloat the image (hadolint DL3040/DL3041).
RUN dnf update -y && \
    dnf install -y \
        hostname \
        net-tools \
        procps-ng \
        tar \
        wget \
        which && \
    dnf clean all

# Install Mambaforge (pinned release, arch-aware via uname -m) and delete
# the installer in the same layer so it never persists in the image.
RUN wget https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Mambaforge-23.3.1-1-Linux-$(uname -m).sh \
        -O mamba-install.sh && \
    bash mamba-install.sh -b -p /opt/mambaforge && \
    rm -f mamba-install.sh

ENV PATH=/opt/mambaforge/bin:${PATH}

COPY conda-env.yml /tmp/

# Install the pinned Dask environment into the base env and drop the
# package cache in the SAME layer — the original removed /opt/mambaforge/pkgs
# in a later RUN, which leaves the full cache in the previous layer and
# does not shrink the image.
RUN mamba env update -n base -f /tmp/conda-env.yml && \
    rm -rf /opt/mambaforge/pkgs

WORKDIR /opt/scripts/daskscripts

# Default dask config picked up by scheduler/worker processes at runtime.
ENV DASK_CONFIG=/opt/scripts/daskscripts/config/dask-config.yml

# Add scheduler/worker launch scripts and the default dask config
COPY scripts/* /opt/scripts/daskscripts/
COPY config /opt/scripts/daskscripts/config

RUN chmod 755 /opt/scripts/daskscripts/*.sh

# NOTE(review): image runs as root (no USER directive) — common for
# HPC/cluster recipes, but consider a non-root user for other deployments.
28 changes: 28 additions & 0 deletions dask/2023.10.1-py11-ol9/conda-env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Conda environment for the Dask 2023.10.1 container image (Python 3.11).
# Channel order matters: conda-forge takes priority over defaults.
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.11
  - pip
  - bokeh>=2.4.2,<3        # dashboard; capped below 3 — presumably for dask dashboard compat, TODO confirm
  - cachey=0.2.1
  - cytoolz>=0.12
  - dask=2023.10.1         # keep in sync with the image tag and Dockerfile labels
  - distributed=2023.10.1  # must match the dask pin
  - h5py
  - jq=1.6
  - jupyter-server-proxy
  - lz4>=4.3.2
  - msgpack-python
  - nomkl=3.0              # presumably to avoid pulling MKL and keep the image small — verify
  - numpy>=1.26.0
  - pandas=1.5.1
  - psutil>=5.7.2
  - python-blosc=1.10.6
  - s3fs>=2021.9.0
  - scipy>=1.9.1
  - streamz=0.6.3
  - tblib>=1.6.0
  - zarr=2.16.1
  # pip-only dependency (must be a nested list under the `pip:` entry)
  - pip:
      - cloudpickle==3.0.0
48 changes: 48 additions & 0 deletions dask/2023.10.1-py11-ol9/config/dask-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Dask distributed configuration overrides for this container image.
# Key paths follow the distributed config schema:
# https://docs.dask.org/en/stable/configuration.html
distributed:
  admin:
    tick:
      interval: 500ms # time between event loop health checks
      limit: 2h       # time allowed between triggering a warning

  comm:
    retry:
      count: 10
    timeouts:
      connect: 600s

  # NOTE(review): the original file introduced a second `distributed:` key
  # before `nanny:`; whether that parsed as a duplicate top-level key or as
  # distributed.distributed.nanny, the setting would not land at
  # distributed.nanny where dask reads it. Nanny settings belong directly
  # under the top-level `distributed:` key — confirm against the deployment.
  nanny:
    pre-spawn-environ:
      MALLOC_TRIM_THRESHOLD_: 0 # return freed memory to the OS eagerly

  scheduler:
    allowed-failures: 5 # default 3
    worker-ttl: 120s    # default 5min
    unknown-task-duration: 120s
    locks:
      lease-timeout: 900s
      lease-validation-interval: 300s

  worker:
    use-file-locking: False
    lifetime:
      stagger: '5 minutes'
    memory:
      transfer: 0.5   # default 0.1
      target: 0.9     # default 0.6
      spill: 0.9      # default 0.7
      pause: false    # default 0.8
      terminate: false # default 0.95
      recent-to-old-time: 300s
      monitor-interval: 500ms # default 100ms
      # rebalance thresholds live under worker.memory in the distributed
      # config schema (sender-min / recipient-max)
      rebalance:
        sender-min: 0.5
        recipient-max: 0.5
    profile:
      interval: 50ms # default 10ms
30 changes: 30 additions & 0 deletions dask/2023.10.1-py11-ol9/scripts/determine_ip.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Determine the IP address of the current host and set it in $local_ip.
# NOTE: this script is SOURCED (not executed) by startmanager.sh and
# startworker.sh, so local_ip is set in the caller's shell and `exit 1`
# terminates the sourcing script as well.

container_engine=$1

local_ip=
if [ "$container_engine" = "docker" ]; then
    # Inside docker: scan the usual ethernet interface names and take the
    # inet address of the first interface that is up.
    for interface in /sys/class/net/{eth*,en*,em*}; do
        [ -e "$interface" ] && \
        [ "$(cat "$interface/operstate")" = "up" ] && \
        local_ip=$(ifconfig "$(basename "$interface")" | grep "inet " | awk '$1=="inet" {print $2; exit}' | sed s/addr://g)
        if [[ -n "$local_ip" ]]; then
            echo "Use IP: $local_ip"
            break
        fi
    done
    if [[ -z "${local_ip}" ]] ; then
        echo "Could not determine local IP: local_ip is empty"
        exit 1
    fi
else
    # Take the last IP that's listed by hostname -i.
    # This hack works on the Janelia Cluster and AWS EC2.
    local_ip=$(hostname -i | rev | cut -d' ' -f1 | rev)
    # Original message said "Use Spark IP" — a copy/paste leftover from the
    # Spark container recipe; this is the Dask recipe.
    echo "Use IP: $local_ip"
    if [[ -z "${local_ip}" ]] ; then
        echo "Could not determine local IP: local_ip is empty"
        exit 1
    fi
fi
12 changes: 12 additions & 0 deletions dask/2023.10.1-py11-ol9/scripts/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash -ue
# Prepare the given dir by creating it, or cleaning it
# of Dask content if it already exists.
work_dir=$1

if [[ ! -d "${work_dir}" ]] ; then
    echo "Creating work directory: ${work_dir}"
    mkdir -p "${work_dir}"
else
    echo "Cleaning existing work directory: ${work_dir}"
    # Quote the variable (a path containing spaces would word-split and
    # rm could target the wrong paths) but leave the glob unquoted so it
    # expands; `|| true` keeps -e semantics happy when the dir is empty
    # and the glob stays literal.
    rm -rf "${work_dir}"/* || true
fi
81 changes: 81 additions & 0 deletions dask/2023.10.1-py11-ol9/scripts/startmanager.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash -ue
# Start the Dask scheduler process in the background and wait for a
# terminate signal (the appearance of the terminate file).
DIR=$(cd "$(dirname "$0")"; pwd)

container_engine=
scheduler_pid_file=
terminate_file=
scheduler_start_timeout=
scheduler_poll_interval=
scheduler_work_dir=.
args=()

# Parse the options this wrapper understands; anything unrecognized is
# forwarded verbatim to `dask scheduler`.
# Use -gt: the original `[[ $# > 0 ]]` is a lexicographic string
# comparison inside [[ ]], not a numeric one.
while [[ $# -gt 0 ]]; do
    key="$1"
    shift # past the key
    case $key in
        --container-engine)
            container_engine="$1"
            shift
            ;;
        --pid-file)
            scheduler_pid_file=$1
            args+=("--pid-file" "${scheduler_pid_file}")
            shift
            ;;
        --scheduler-start-timeout)
            scheduler_start_timeout=$1
            shift
            ;;
        --scheduler-poll-interval)
            # parsed but currently unused — kept for interface compatibility
            scheduler_poll_interval=$1
            shift
            ;;
        --scheduler-work-dir)
            scheduler_work_dir=$1
            shift
            ;;
        --terminate-file)
            terminate_file=$1
            shift
            ;;
        *)
            args+=("${key}")
            ;;
    esac
done

# Kill the background scheduler when this wrapper gets INT/TERM or exits.
# NOTE(review): `exit 0` here masks any failure status of the wrapper —
# confirm callers do not rely on a non-zero exit code.
function cleanup() {
    echo "Killing scheduler background processes"
    if [[ -f "${scheduler_pid_file}" ]]; then
        local dpid
        dpid=$(cat "${scheduler_pid_file}")
        kill "$dpid"
    fi
    exit 0
}
trap cleanup INT TERM EXIT

echo "Determining scheduler IP address..."
# Sourced: sets $local_ip in this shell (or exits on failure). Quoting
# container_engine guarantees $1 is defined there even when it is empty
# (this script runs with set -u).
. "$DIR/determine_ip.sh" "$container_engine"

# Start the scheduler in the background; quote the array so forwarded
# arguments containing spaces survive intact.
echo "Run: dask scheduler --host ${local_ip} ${args[*]}"
dask scheduler --host "${local_ip}" "${args[@]}" \
    2> >(tee "${scheduler_work_dir}/dask-scheduler.log" >&2) \
    &

# Wait for the PID file (or the terminate file) to appear.
# Make sure there is a timeout param since the default wait does not
# timeout. The timeout stays unquoted on purpose: when empty it is
# omitted entirely rather than passed as an empty argument.
"${DIR}"/waitforanyfile.sh 0 "${terminate_file},${scheduler_pid_file}" ${scheduler_start_timeout}

if [[ -e "${scheduler_pid_file}" ]] ; then
    scheduler_pid=$(cat "${scheduler_pid_file}")
    echo "Scheduler started: pid=$scheduler_pid"

    # Scheduler was started — block until the terminate file shows up
    # or the scheduler process dies.
    echo "Wait for termination event: ${terminate_file}"
    "${DIR}"/waitforanyfile.sh "${scheduler_pid}" "${terminate_file}"
else
    echo "Scheduler pid file not found"
    scheduler_pid=0
fi
104 changes: 104 additions & 0 deletions dask/2023.10.1-py11-ol9/scripts/startworker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/bin/bash -ue
# Start the Dask worker process in the background and wait for a
# terminate signal (the appearance of the terminate file).
DIR=$(cd "$(dirname "$0")"; pwd)

container_engine=
worker_name=
worker_dir=
worker_pid_file=
terminate_file=
worker_start_timeout=
worker_poll_interval=
scheduler_address=
args=()

# Parse the options this wrapper understands; anything unrecognized is
# forwarded verbatim to `dask worker`.
# Use -gt: the original `[[ $# > 0 ]]` is a lexicographic string
# comparison inside [[ ]], not a numeric one.
while [[ $# -gt 0 ]]; do
    key="$1"
    shift # past the key
    case $key in
        --container-engine)
            container_engine="$1"
            shift
            ;;
        --pid-file)
            worker_pid_file=$1
            args+=("--pid-file" "${worker_pid_file}")
            shift
            ;;
        --name)
            worker_name=$1
            args+=("--name" "${worker_name}")
            shift
            ;;
        --worker-dir)
            worker_dir=$1
            shift
            ;;
        --scheduler-address)
            scheduler_address=$1
            shift
            ;;
        --worker-start-timeout)
            worker_start_timeout=$1
            shift
            ;;
        --worker-poll-interval)
            # parsed but currently unused — kept for interface compatibility
            worker_poll_interval=$1
            shift
            ;;
        --terminate-file)
            terminate_file=$1
            shift
            ;;
        *)
            args+=("${key}")
            ;;
    esac
done

# Kill the background worker when this wrapper gets INT/TERM or exits.
# NOTE(review): `exit 0` here masks any failure status of the wrapper —
# confirm callers do not rely on a non-zero exit code.
function cleanup() {
    echo "Killing background processes for ${worker_name}"
    if [[ -f "${worker_pid_file}" ]]; then
        local wpid
        wpid=$(cat "${worker_pid_file}")
        kill "$wpid"
    fi
    exit 0
}
trap cleanup INT TERM EXIT

echo "Determining worker ${worker_name} IP address..."
# Sourced: sets $local_ip in this shell (or exits on failure). Quoting
# container_engine guarantees $1 is defined there even when it is empty
# (this script runs with set -u).
. "$DIR/determine_ip.sh" "$container_engine"

if [[ -n ${worker_dir} ]] ; then
    mkdir -p "${worker_dir}"
fi

# Start the worker in the background; quote the array so forwarded
# arguments containing spaces survive intact. scheduler_address stays
# unquoted on purpose: when empty it is omitted rather than passed as "".
# NOTE(review): if --worker-dir is not given, the log path becomes
# "/<name>.log" — confirm callers always pass a worker dir.
echo "Run: dask worker \
    --host ${local_ip} \
    --local-directory ${worker_dir} \
    ${args[*]} \
    ${scheduler_address}"
dask worker \
    --host "${local_ip}" \
    --local-directory "${worker_dir}" \
    "${args[@]}" \
    ${scheduler_address} \
    2> >(tee "${worker_dir}/${worker_name}.log" >&2) \
    &

# Wait for the PID file (or the terminate file) to appear.
# Make sure there is a timeout param since the default wait does not
# timeout. The timeout stays unquoted on purpose: when empty it is
# omitted entirely rather than passed as an empty argument.
"${DIR}"/waitforanyfile.sh 0 "${terminate_file},${worker_pid_file}" ${worker_start_timeout}

if [[ -e "${worker_pid_file}" ]] ; then
    worker_pid=$(cat "${worker_pid_file}")
    echo "Worker ${worker_name} started: pid=$worker_pid"

    # Worker was started — block until the terminate file shows up or
    # the worker process dies.
    echo "Worker ${worker_name} - wait for termination event: ${terminate_file}"
    "${DIR}"/waitforanyfile.sh "${worker_pid}" "${terminate_file}"
else
    echo "Worker ${worker_name} pid file not found"
    worker_pid=0
fi
Loading

0 comments on commit d7c6b97

Please sign in to comment.