Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dask container recipe #545

Merged
merged 22 commits into from
Nov 16, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
dask recipes
  • Loading branch information
Cristian Goina committed Nov 14, 2023
commit 42155a49c28a25470d281c15e76899987bd57279
46 changes: 46 additions & 0 deletions dask/2023.10.1-ol9/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Dask (https://www.dask.org) container image based on Oracle Linux 9.
#
# NOTE: an ARG declared before FROM lives in its own scope and MUST be
# re-declared after FROM to be visible inside the build stage.
ARG DASK_VERSION=2023.10.1

FROM oraclelinux:9

# Re-import the pre-FROM ARG into this stage so labels can reference it
# instead of hard-coding the version a second time.
ARG DASK_VERSION

LABEL base_image="oraclelinux:9"
LABEL software="Dask"
LABEL software.version="${DASK_VERSION}"
LABEL about.summary="Dask is a flexible library for parallel computing in Python."
LABEL about.home="https://www.dask.org"
LABEL about.license="BSD-3-Clause"
LABEL about.license_file="https://github.com/dask/dask/blob/main/LICENSE.txt"
LABEL about.documentation="https://docs.dask.org/en/stable/"

# Set automatically by buildx on multi-platform builds (currently unused).
ARG TARGETPLATFORM

# Expose all host GPUs when run under the NVIDIA container runtime.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=all

RUN dnf update -y && \
    dnf install -y \
        tar wget \
        hostname \
        procps-ng \
        net-tools \
        which && \
    dnf clean all

# Install mambaforge (conda + mamba) for the build architecture,
# removing the installer afterwards to keep the layer small.
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-$(uname -m).sh \
        -O mamba-install.sh && \
    bash mamba-install.sh -b -p /opt/mambaforge && \
    rm -f mamba-install.sh

ENV PATH=/opt/mambaforge/bin:${PATH}

COPY conda-env.yml /tmp/

# Install dask and its dependencies into the base conda environment.
RUN mamba env update -n base -f /tmp/conda-env.yml

# Drop the conda package cache to reduce image size.
RUN rm -rf /opt/mambaforge/pkgs

WORKDIR /opt/scripts/daskscripts

ENV DASK_CONFIG=/opt/scripts/daskscripts/config/dask-config.yml

# Add scripts.
# The destination must end with '/' because COPY has multiple sources.
COPY --chmod=755 scripts/* /opt/scripts/daskscripts/
COPY config /opt/scripts/daskscripts/config
28 changes: 28 additions & 0 deletions dask/2023.10.1-ol9/conda-env.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Conda environment spec for the Dask container image.
# Applied to the base environment via `mamba env update -n base -f conda-env.yml`.
channels:
# conda-forge is listed first so it takes priority over defaults
- conda-forge
- defaults
dependencies:
- python=3.11
- pip
- bokeh>=2.4.2,<3 # pinned below 3 — presumably for dask dashboard compatibility; verify
- cachey=0.2.1
- cytoolz>=0.12
# dask and distributed are pinned to the same release as the image tag
- dask=2023.10.1
- distributed=2023.10.1
- h5py
- jq=1.6
- jupyter-server-proxy
- lz4>=4.3.2
- msgpack-python
- nomkl=3.0 # opt out of MKL-linked builds
- numpy>=1.26.0
- pandas=1.5.1
- psutil>=5.7.2
- python-blosc=1.10.6
- s3fs>=2021.9.0
- scipy>=1.9.1
- streamz=0.6.3
- tblib>=1.6.0
- zarr=2.16.1
# pip-only dependencies
- pip:
- cloudpickle==3.0.0
48 changes: 48 additions & 0 deletions dask/2023.10.1-ol9/config/dask-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Dask distributed configuration overrides for the container
# (referenced by the image's DASK_CONFIG environment variable).
distributed:
  admin:
    tick:
      interval: 500ms          # time between event loop health checks
      limit: 2h                # time allowed between triggering a warning

  comm:
    retry:
      count: 10

    timeouts:
      connect: 600s

  # FIX: these nanny settings were previously nested under a duplicated
  # 'distributed:' key (i.e. distributed.distributed.nanny), a path that
  # dask does not read; 'nanny' belongs directly under 'distributed'.
  nanny:
    pre-spawn-environ:
      MALLOC_TRIM_THRESHOLD_: 0

  scheduler:
    allowed-failures: 5            # default 3
    worker-ttl: 120s               # default 5min
    unknown-task-duration: 120s

    locks:
      lease-timeout: 900s
      lease-validation-interval: 300s

  worker:
    use-file-locking: False

    lifetime:
      stagger: '5 minutes'

    memory:
      transfer: 0.5              # default 0.1
      target: 0.9                # default 0.6
      spill: 0.9                 # default 0.7
      pause: false               # default 0.8
      terminate: false           # default 0.95
      recent-to-old-time: 300s
      monitor-interval: 500ms    # default 100ms

    rebalance:
      sender-min: 0.5
      recipient-max: 0.5

    profile:
      interval: 50ms             # default 10ms
30 changes: 30 additions & 0 deletions dask/2023.10.1-ol9/scripts/determine_ip.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Determine the IP address of the current host and leave it in 'local_ip'.
# This script is meant to be *sourced* by the start scripts, so on failure
# the 'exit 1' terminates the sourcing caller as well.
#
# $1 - container engine name ("docker" or anything else)

container_engine=$1

local_ip=
if [[ "$container_engine" == "docker" ]]; then
    # Inside a docker container, probe the usual ethernet interface names
    # and take the address of the first interface that is up.
    for interface in /sys/class/net/{eth*,en*,em*}; do
        if [[ -e "$interface" && "$(cat "$interface/operstate")" == "up" ]]; then
            # awk already selects the 'inet' line, so no separate grep is needed;
            # the sed strips the 'addr:' prefix printed by some ifconfig variants
            local_ip=$(ifconfig "$(basename "$interface")" | awk '$1=="inet" {print $2; exit}' | sed 's/addr://g')
        fi
        if [[ -n "$local_ip" ]]; then
            echo "Use IP: $local_ip"
            break
        fi
    done
    if [[ -z "${local_ip}" ]] ; then
        echo "Could not determine local IP: local_ip is empty"
        exit 1
    fi
else
    # Take the last IP that's listed by hostname -i.
    # This hack works on Janelia Cluster and AWS EC2.
    local_ip=$(hostname -i | rev | cut -d' ' -f1 | rev)
    # FIX: message previously said "Use Spark IP" (copy-paste from a Spark recipe)
    echo "Use IP: $local_ip"
    if [[ -z "${local_ip}" ]] ; then
        echo "Could not determine local IP: local_ip is empty"
        exit 1
    fi
fi
12 changes: 12 additions & 0 deletions dask/2023.10.1-ol9/scripts/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash -ue
# Prepare the given dir by creating it, or cleaning it
# of Dask content if it already exists.
#
# $1 - the work directory to prepare
work_dir=$1

if [[ ! -d "${work_dir}" ]] ; then
    echo "Creating work directory: ${work_dir}"
    mkdir -p "${work_dir}"
else
    echo "Cleaning existing work directory: ${work_dir}"
    # FIX: quote the directory (it may contain spaces) and guard with ':?'
    # so an empty value can never expand to a bare '/*'; the glob itself
    # stays unquoted so it expands. Best-effort: ignore busy/undeletable files.
    rm -rf "${work_dir:?}"/* || true
fi
81 changes: 81 additions & 0 deletions dask/2023.10.1-ol9/scripts/startmanager.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash -ue
# Start the Dask scheduler process and wait for terminate signal.
#
# Known options are consumed here; anything else is forwarded verbatim
# to `dask scheduler`:
#   --container-engine <name>        container engine ("docker" or other)
#   --pid-file <file>                scheduler PID file (also forwarded)
#   --scheduler-start-timeout <s>    max time to wait for the PID file
#   --scheduler-poll-interval <s>    poll interval (accepted, currently unused)
#   --scheduler-work-dir <dir>       directory for the scheduler log
#   --terminate-file <file>          file whose appearance triggers shutdown
DIR=$(cd "$(dirname "$0")"; pwd)

container_engine=
scheduler_pid_file=
terminate_file=
scheduler_start_timeout=
scheduler_poll_interval=
scheduler_work_dir=.
args=()

# FIX: use arithmetic comparison; [[ $# > 0 ]] compares lexicographically
while (( $# > 0 )); do
    key="$1"
    shift # past the key
    case $key in
        --container-engine)
            container_engine="$1"
            shift
            ;;
        --pid-file)
            scheduler_pid_file=$1
            args+=("--pid-file" "${scheduler_pid_file}")
            shift
            ;;
        --scheduler-start-timeout)
            scheduler_start_timeout=$1
            shift
            ;;
        --scheduler-poll-interval)
            scheduler_poll_interval=$1
            shift
            ;;
        --scheduler-work-dir)
            scheduler_work_dir=$1
            shift
            ;;
        --terminate-file)
            terminate_file=$1
            shift
            ;;
        *)
            # forward unrecognized options to dask scheduler
            args+=("${key}")
            ;;
    esac
done

# Kill the background scheduler (if it ever wrote its PID file) on any exit.
function cleanup() {
    echo "Killing scheduler background processes"
    if [[ -f "${scheduler_pid_file}" ]]; then
        local dpid
        dpid=$(cat "${scheduler_pid_file}")
        # the process may already be gone - never let cleanup itself fail
        kill "$dpid" 2>/dev/null || true
    fi
    exit 0
}
trap cleanup INT TERM EXIT

echo "Determining scheduler IP address..."
# sourced so that it sets 'local_ip' in this shell
. "$DIR/determine_ip.sh" "$container_engine"

# start scheduler in background
echo "Run: dask scheduler --host ${local_ip} ${args[*]}"
dask scheduler --host "${local_ip}" "${args[@]}" \
    2> >(tee "${scheduler_work_dir}/dask-scheduler.log" >&2) \
    &

# wait for PID file
# make sure there is a timeout param since the default wait does not timeout
# (timeout intentionally unquoted: it is omitted entirely when empty)
"${DIR}/waitforanyfile.sh" 0 "${terminate_file},${scheduler_pid_file}" ${scheduler_start_timeout}

if [[ -e "${scheduler_pid_file}" ]] ; then
    scheduler_pid=$(cat "${scheduler_pid_file}")
    echo "Scheduler started: pid=$scheduler_pid"

    # scheduler was started - wait until terminate
    echo "Wait for termination event: ${terminate_file}"
    "${DIR}/waitforanyfile.sh" "${scheduler_pid}" "${terminate_file}"
else
    echo "Scheduler pid file not found"
    scheduler_pid=0
fi
104 changes: 104 additions & 0 deletions dask/2023.10.1-ol9/scripts/startworker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/bin/bash -ue
# Start the Dask worker process and wait for terminate signal.
#
# Known options are consumed here; anything else is forwarded verbatim
# to `dask worker`:
#   --container-engine <name>      container engine ("docker" or other)
#   --pid-file <file>              worker PID file (also forwarded)
#   --name <name>                  worker name (also forwarded)
#   --worker-dir <dir>             worker local/log directory (created if needed)
#   --scheduler-address <addr>     address of the running scheduler
#   --worker-start-timeout <s>     max time to wait for the PID file
#   --worker-poll-interval <s>     poll interval (accepted, currently unused)
#   --terminate-file <file>        file whose appearance triggers shutdown
DIR=$(cd "$(dirname "$0")"; pwd)

container_engine=
worker_name=
worker_dir=
worker_pid_file=
terminate_file=
worker_start_timeout=
worker_poll_interval=
scheduler_address=
args=()

# FIX: use arithmetic comparison; [[ $# > 0 ]] compares lexicographically
while (( $# > 0 )); do
    key="$1"
    shift # past the key
    case $key in
        --container-engine)
            container_engine="$1"
            shift
            ;;
        --pid-file)
            worker_pid_file=$1
            args+=("--pid-file" "${worker_pid_file}")
            shift
            ;;
        --name)
            worker_name=$1
            args+=("--name" "${worker_name}")
            shift
            ;;
        --worker-dir)
            worker_dir=$1
            shift
            ;;
        --scheduler-address)
            scheduler_address=$1
            shift
            ;;
        --worker-start-timeout)
            worker_start_timeout=$1
            shift
            ;;
        --worker-poll-interval)
            worker_poll_interval=$1
            shift
            ;;
        --terminate-file)
            terminate_file=$1
            shift
            ;;
        *)
            # forward unrecognized options to dask worker
            args+=("${key}")
            ;;
    esac
done

# Kill the background worker (if it ever wrote its PID file) on any exit.
function cleanup() {
    echo "Killing background processes for ${worker_name}"
    if [[ -f "${worker_pid_file}" ]]; then
        local wpid
        wpid=$(cat "${worker_pid_file}")
        # the process may already be gone - never let cleanup itself fail
        kill "$wpid" 2>/dev/null || true
    fi
    exit 0
}
trap cleanup INT TERM EXIT

echo "Determining worker ${worker_name} IP address..."
# sourced so that it sets 'local_ip' in this shell
. "$DIR/determine_ip.sh" "$container_engine"

if [[ -n "${worker_dir}" ]] ; then
    mkdir -p "${worker_dir}"
fi

# start worker in background
echo "Run: dask worker --host ${local_ip} --local-directory ${worker_dir} ${args[*]} ${scheduler_address}"
dask worker \
    --host "${local_ip}" \
    --local-directory "${worker_dir}" \
    "${args[@]}" \
    "${scheduler_address}" \
    2> >(tee "${worker_dir}/${worker_name}.log" >&2) \
    &

# wait for PID file
# make sure there is a timeout param since the default wait does not timeout
# (timeout intentionally unquoted: it is omitted entirely when empty)
"${DIR}/waitforanyfile.sh" 0 "${terminate_file},${worker_pid_file}" ${worker_start_timeout}

if [[ -e "${worker_pid_file}" ]] ; then
    worker_pid=$(cat "${worker_pid_file}")
    echo "Worker ${worker_name} started: pid=$worker_pid"

    # worker was started - wait until terminate
    echo "Worker ${worker_name} - wait for termination event: ${terminate_file}"
    "${DIR}/waitforanyfile.sh" "${worker_pid}" "${terminate_file}"
else
    echo "Worker ${worker_name} pid file not found"
    worker_pid=0
fi
Loading