Skip to content

Commit

Permalink
add files
Browse files Browse the repository at this point in the history
  • Loading branch information
simonpintarelli committed Aug 9, 2024
1 parent 0b6512a commit f5516cc
Show file tree
Hide file tree
Showing 6 changed files with 390 additions and 0 deletions.
14 changes: 14 additions & 0 deletions ci/cscs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# CI pipeline definition: a single stage that builds the Slurm base image
# described by ci/slurm_docker/Dockerfile.

# Shared CI templates fetched from the cscs-ci/recipes repository.
# NOTE(review): presumably this is where `.container-builder-cscs-gh200` is defined — confirm.
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- baseimage

# Image build job for the `baseimage` stage.
build aarch64:
extends: .container-builder-cscs-gh200
stage: baseimage
# Job time limit (the Dockerfile compiles Slurm from source via install_slurm.sh).
timeout: 10h
variables:
# Dockerfile to build, relative to the repository root.
DOCKERFILE: ci/slurm_docker/Dockerfile
# The Dockerfile plus every file it COPYs into the image.
# NOTE(review): presumably a change to any of these triggers a rebuild — confirm against the template.
WATCH_FILECHANGES: ci/slurm_docker/Dockerfile ci/slurm_docker/cgroup.conf ci/slurm_docker/entrypoint.sh ci/slurm_docker/install_slurm.sh ci/slurm_docker/slurm.conf.in
# Registry location under which the built image is stored.
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/slurm-uenv-mount
64 changes: 64 additions & 0 deletions ci/slurm_docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Base image, pinned to a fixed openSUSE Leap release tag.
FROM opensuse/leap:15.4

# Build-time settings (override with --build-arg):
#   SLURM_VERSION  Slurm release handed to install_slurm.sh
#   SLURM_ROOT     installation prefix
#   SLURM_CONFDIR  directory that receives cgroup.conf and slurm.conf.in
ARG SLURM_VERSION=23.02.7
ARG SLURM_ROOT=/usr
ARG SLURM_CONFDIR=/etc/slurm

# Persist the settings into the image environment: entrypoint.sh reads
# SLURM_ROOT and SLURM_CONFDIR at container start.
# key=value form is used because the space-separated `ENV key value` form is
# deprecated.
ENV SLURM_VERSION=${SLURM_VERSION} \
    SLURM_ROOT=${SLURM_ROOT} \
    SLURM_CONFDIR=${SLURM_CONFDIR}


# Build and runtime dependencies for Slurm, installed in a single layer.
# The package set is unchanged (sorted alphabetically for easier diffs);
# `zypper clean --all` runs in the same RUN so the downloaded package cache
# does not end up in the image layer.
# sudo is required at run time: entrypoint.sh starts munged via `sudo -u munge`.
RUN zypper install -y \
        bzip2 \
        ca-certificates \
        curl \
        dbus-1 \
        dbus-1-devel \
        freeipmi \
        freeipmi-devel \
        gcc \
        gcc-c++ \
        hwloc \
        hwloc-devel \
        less \
        liblz4-devel \
        libmount-devel \
        libnuma-devel \
        libnuma1 \
        librrd8 \
        libz1 \
        lua53 \
        lua53-devel \
        lz4 \
        make \
        munge \
        munge-devel \
        python3 \
        readline-devel \
        sudo \
        tar \
        vim \
        zlib-devel \
    && zypper clean --all

# Create the `slurm` account (-M: no home directory) together with the log
# directory and the two spool directories, which are handed over to that user.
RUN useradd -M slurm \
    && mkdir -p /var/log/slurm /var/spool/slurmctld /var/spool/slurmd \
    && chown slurm /var/spool/slurmctld /var/spool/slurmd \
    && chmod u+rwx /var/spool/slurmctld /var/spool/slurmd


# Build and install Slurm from source.
# Absolute paths are used throughout so nothing depends on the working
# directory (no WORKDIR is set, so the previous "." destinations resolved to /).
COPY install_slurm.sh /install_slurm.sh

RUN /install_slurm.sh "${SLURM_VERSION}" "${SLURM_ROOT}" "${SLURM_CONFDIR}" --enable-multiple-slurmd

# Configuration inputs read by entrypoint.sh. The trailing slash makes the
# destination a directory; COPY creates it if it does not exist yet.
COPY cgroup.conf slurm.conf.in ${SLURM_CONFDIR}/

# entrypoint.sh generates slurm.conf, starts the daemons and finally exec's
# the container command. An absolute ENTRYPOINT keeps working when the
# container is started with a different working directory (e.g. --workdir).
COPY entrypoint.sh /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
CMD ["bash"]

5 changes: 5 additions & 0 deletions ci/slurm_docker/cgroup.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Slurm cgroup configuration; the Dockerfile copies this file into ${SLURM_CONFDIR}.
# NOTE(review): slurm.conf.in selects ProctrackType=proctrack/linuxproc and
# TaskPlugin=task/affinity, so these settings may not be exercised — confirm.
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
CgroupMountpoint=/sys/fs/cgroup
CgroupPlugin=cgroup/v1
86 changes: 86 additions & 0 deletions ci/slurm_docker/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash

# Launch the helper daemons first: dbus-launch, then munged running as the
# munge user.
dbus-launch
sudo -u munge munged

# Every setting below keeps a value that is already set in the environment
# and only falls back to the default shown here when the variable is unset.

# Template and generated Slurm configuration files
SLURM_CONF_IN="${SLURM_CONF_IN-$SLURM_CONFDIR/slurm.conf.in}"
SLURM_CONF="${SLURM_CONF-$SLURM_CONFDIR/slurm.conf}"

# Number of emulated slurm nodes
SLURM_NUMNODES="${SLURM_NUMNODES-3}"

# Slurm controller host and address
SLURMCTLD_HOST="${SLURMCTLD_HOST-$HOSTNAME}"
SLURMCTLD_ADDR="${SLURMCTLD_ADDR-127.0.0.1}"

# Node host, address and first slurmd port
NODE_HOST="${NODE_HOST-$HOSTNAME}"
NODE_ADDR="${NODE_ADDR-127.0.0.1}"
NODE_BASEPORT="${NODE_BASEPORT-6001}"

# Hardware profile advertised for every node
NODE_HW="${NODE_HW-CPUs=4}"

# Derived values: the node-name range nd[00001-NNNNN] and one port per node,
# counted upwards from NODE_BASEPORT.
printf -v NODE_NAMES "nd[%05i-%05i]" 1 $SLURM_NUMNODES
last_port=$(($NODE_BASEPORT+$SLURM_NUMNODES-1))
printf -v NODE_PORTS "%i-%i" $NODE_BASEPORT $last_port


# Print the effective configuration; the key=value rows are aligned into
# columns by `column -t`.
printf '%s\n' \
    "INFO:" \
    "INFO: Creating $SLURM_CONF with" \
    "INFO: "
printf '%s\n' \
    "INFO: SLURMCTLD_HOST=$SLURMCTLD_HOST SLURMCTLD_ADDR=$SLURMCTLD_ADDR" \
    "INFO: NODE_HOST=$NODE_HOST NODE_ADDR=$NODE_ADDR NODE_BASEPORT=$NODE_BASEPORT" \
    "INFO: NODE_HW=$NODE_HW" \
    "INFO: SLURM_NUMNODES=$SLURM_NUMNODES" \
    | column -t
printf '%s\n' \
    "INFO: " \
    "INFO: Derived values:" \
    "INFO:"
printf '%s\n' \
    "INFO: NODE_NAMES=$NODE_NAMES" \
    "INFO: NODE_PORTS=$NODE_PORTS" \
    | column -t
printf '%s\n' \
    "INFO:" \
    "INFO: Override any of the non-derived values by setting the respective environment variable" \
    "INFO: when starting Docker." \
    "INFO:"

# Put the Slurm installation on the search paths.
export PATH=$SLURM_ROOT/bin:$PATH
# Only append the previous value when there is one: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH=$SLURM_ROOT/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
# NOTE(review): the trailing ':' left here when MANPATH is unset is kept on
# purpose — man-db reads an empty entry as "insert the default search path";
# confirm for the man implementation shipped in the image.
export MANPATH=$SLURM_ROOT/man:$MANPATH

# Generate $SLURM_CONF: sed reads the template first and then ("-") the
# NodeName/PartitionName lines produced here, replacing the SLURMCTLDHOST and
# SLURMCTLDADDR placeholders on the way through.
# The file paths are quoted so a path containing whitespace stays one argument.
{
    echo "NodeName=${NODE_NAMES} NodeHostname=${NODE_HOST} NodeAddr=${NODE_ADDR} Port=${NODE_PORTS} State=UNKNOWN ${NODE_HW}"
    echo "PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP"
} \
    | sed -e "s/SLURMCTLDHOST/${SLURMCTLD_HOST}/" \
          -e "s/SLURMCTLDADDR/${SLURMCTLD_ADDR}/" \
          "$SLURM_CONF_IN" - \
    > "$SLURM_CONF"

# Expand the node range (e.g. nd[00001-00003]) into individual host names.
# $NODE_NAMES is quoted because it contains '[...]', which the shell would
# otherwise treat as a glob pattern and could expand against file names in the
# current directory.
NODE_NAME_LIST=$(scontrol show hostnames "$NODE_NAMES")

# Make every node name resolve to NODE_ADDR.
# $NODE_NAME_LIST is deliberately left unquoted: it is split into one word per
# host name.
for n in $NODE_NAME_LIST
do
    echo "$NODE_ADDR $n" >> /etc/hosts
done

echo
echo "Starting Slurm services..."
echo

# Controller first, then one slurmd per emulated node (-N selects the node name).
"$SLURM_ROOT/sbin/slurmctld"

for n in $NODE_NAME_LIST
do
    "$SLURM_ROOT/sbin/slurmd" -N "$n"
done

# Show the resulting cluster state.
echo
sinfo
echo
echo

# Hand control to the container command (CMD or the `docker run` arguments),
# replacing this shell.
exec "$@"
66 changes: 66 additions & 0 deletions ci/slurm_docker/install_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash -x
#
# Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconf-dir> [configure-args]
#
# Downloads the Slurm release tarball, unpacks it under /opt/src and builds it
# in /opt/build/slurm-<slurm-version>. Any further arguments are passed on to
# ./configure; the single extra argument NO_BUILD stops after unpacking.
#

# All three positional arguments are mandatory and must be non-empty. This is
# checked before shifting so that a short argument list is reported as a usage
# error.
if [ $# -lt 3 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ];
then
    echo "Usage: install_slurm.sh <slurm-version> <install-prefix> <sysconf-dir> [configure-args]" >&2
    echo "Slurm version, install-prefix and sysconf-dir must all be specified on the command line. Aborting." >&2
    exit 1
fi

SLURM_VERSION=$1
SLURM_ROOT=$2
SLURM_CONFDIR=$3
shift 3
# Remaining arguments, flattened into one string (extra configure options).
ARGS=$*

slurm_tar_file=slurm-${SLURM_VERSION}.tar.bz2
slurm_url=https://download.schedmd.com/slurm/${slurm_tar_file}

#
# Download slurm tarball and unpack it
#
mkdir -p /opt/src || exit 1
(
    cd /opt/src || exit 1

    # Reuse a tarball that is already present; otherwise fetch it.
    if [ ! -f "$slurm_tar_file" ]; then
        echo "=== downloading slurm ${SLURM_VERSION} from ${slurm_url}"
        # --fail: treat HTTP errors as failures; --location: follow redirects.
        # A partial file is removed so that a retry downloads it again.
        curl --fail --location --output "${slurm_tar_file}" "${slurm_url}" \
            || { rm -f "${slurm_tar_file}"; exit 1; }
    fi

    echo "=== unpacking $slurm_tar_file"
    tar -xjf "${slurm_tar_file}" || exit 1
) || exit 1   # `exit` inside ( ... ) only leaves the subshell; propagate the failure

# NO_BUILD as the only extra argument means "download and unpack only".
if [ "$ARGS" = "NO_BUILD" ];
then
    exit 0
fi

#
# Remove any old build directory.
# Run configure, make, make install
#

slurm_src_dir=/opt/src/slurm-${SLURM_VERSION}
slurm_build_dir=/opt/build/slurm-${SLURM_VERSION}

# rm -rf succeeds whether or not the directory exists, so no prior check is needed.
rm -rf "$slurm_build_dir"
mkdir -p "$slurm_build_dir" || exit 1
(
    # Out-of-tree build: never run configure/make from the wrong directory.
    cd "$slurm_build_dir" || exit 1

    # Print the available configure options into the build log.
    "$slurm_src_dir/configure" --help

    # $ARGS is intentionally unquoted: it is split into separate configure options.
    "$slurm_src_dir/configure" \
        --prefix="${SLURM_ROOT}" \
        --sysconfdir="${SLURM_CONFDIR}" \
        --disable-dependency-tracking \
        $ARGS \
        || exit 1

    # Parallel build; MAKE_JOBS overrides the default of 4 jobs.
    make -j"${MAKE_JOBS:-4}" && make install
) || exit 1   # propagate any failure from the build subshell

155 changes: 155 additions & 0 deletions ci/slurm_docker/slurm.conf.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
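#
# slurm.conf template for the Slurm test container.
#
# Originally generated by configurator.html (in doc/html of the Slurm
# sources); run it to build a configuration file customized for your
# environment.
#
# This file is not used as-is: entrypoint.sh turns it into slurm.conf at
# container start by replacing the SLURMCTLDHOST and SLURMCTLDADDR
# placeholders below and appending the NodeName/PartitionName lines.
# The resulting slurm.conf must be the same on all nodes of the cluster.
# See the slurm.conf man page for more information.
#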
ClusterName=cluster
SlurmctldHost=SLURMCTLDHOST(SLURMCTLDADDR)
#SlurmctldHost=
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=pmi2
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#ProctrackType=proctrack/cgroup
ProctrackType=proctrack/linuxproc
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.%n.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd.%n
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=debug2
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=debug2
SlurmdLogFile=/var/log/slurm/slurmd.%n.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
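# COMPUTE NODES
# The active NodeName and PartitionName lines are appended by entrypoint.sh
# when it generates slurm.conf; the commented-out lines below only illustrate
# their shape.
#NodeName=nd[1-3] NodeHostname=DOCKER_HOSTNAME NodeAddr=127.0.0.1 Port=[6001-6003] CPUs=4 State=UNKNOWN
#PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP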

0 comments on commit f5516cc

Please sign in to comment.