#!/bin/bash

# startdocker: start a Docker daemon in the current container. Useful when
# using systems like the Gitlab Runner's Kubernetes executor that can't use
# ENTRYPOINT. If PORT is set, starts the daemon in the foreground. Else, starts
# the daemon in the background. Passes along DOCKER_DAEMON_ARGS to the daemon,
# and sends the log to a file if LOG is set to "file".
#
# When cgroups v2 are in use, and we are found to be running at the root of the
# visible cgroup hierarchy (i.e. in a plain Kubernetes container not enhanced
# with something like
# https://github.com/k3d-io/k3d/pull/579/files#diff-71e760f22ea8192fe65294b2330d4bd29fc3888fbf283ba4ac69fda1af3878dd),
# then we try to turn off all the confinement domains, bundle all the processes
# in the cgroup into a new child cgroup, and turn them back on again, so that
# Docker can actually use cgroups v2 to further confine sub-containers.
# Otherwise, Docker tries to make a cgroup directly under the container's root
# one, and if it tries to enable e.g. a memory limit on it, cgroups v2 refuses
# because running processes aren't allowed in cgroups with resource-limited
# children, because that makes scheduler correctness hard. See
# https://github.com/docker/for-mac/issues/6288#issuecomment-1250799498 and
# also the documentation for cgroups v2 at
# https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html.
#
# To *stop* Docker, use stopdocker
#
# If running without PORT, waits for Docker to be ready, and automatically
# succeeds if docker is already available.

# Skip daemon startup only in background mode (PORT unset or empty) when the
# Docker socket already exists. With PORT set, the else branch always runs and
# ends up exec-ing dockerd in the foreground.
if [ ! "$PORT" ] && [ -e /var/run/docker.sock ]
then
    # Someone must have already started Docker.
    echo 'Not starting docker: /var/run/docker.sock already exists' >&2
else
    # Ensure that all nodes in /dev/mapper correspond to mapped devices
    # currently loaded by the device-mapper kernel driver.
    # The exit status is not checked; the script carries on either way.
    dmsetup mknodes

    # First, make sure that cgroups are mounted correctly.
    CGROUP=/sys/fs/cgroup
    # Default LOG to "stdio" when it is unset or empty. The expansion needs
    # its leading "$": without it the shell only hands a literal word to ":"
    # and LOG is never assigned.
    : "${LOG:=stdio}"

    # Create the cgroup mount point if it does not exist yet.
    if [ ! -d "$CGROUP" ]
    then
        mkdir "$CGROUP"
    fi

    # Unless something is already mounted there, back it with a tmpfs.
    # Failing to mount is fatal.
    if ! mountpoint -q "$CGROUP"
    then
        if ! mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup "$CGROUP"
        then
            echo "Could not make a tmpfs mount. Did you use --privileged?"
            exit 1
        fi
    fi

    # Mount securityfs when the directory exists but nothing is mounted on it.
    # Failure here only produces a warning; the script keeps going.
    if [ -d /sys/kernel/security ]
    then
        if ! mountpoint -q /sys/kernel/security
        then
            if ! mount -t securityfs none /sys/kernel/security
            then
                echo "Could not mount /sys/kernel/security."
                echo "AppArmor detection and --privileged mode might break."
            fi
        fi
    fi

    # Mount the cgroup hierarchies exactly as they are in the parent system.
    # TODO: Is this needed anymore? The Docker dind removed it.
    # Each SUBSYS is one entry from field 2 of /proc/1/cgroup.
    for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
    do
        if [ ! -d "$CGROUP/$SUBSYS" ]
        then
            mkdir "$CGROUP/$SUBSYS"
        fi
        if ! mountpoint -q "$CGROUP/$SUBSYS"
        then
            mount -n -t cgroup -o "$SUBSYS" cgroup "$CGROUP/$SUBSYS"
        fi

        # The two following sections address a bug which manifests itself
        # by a cryptic "lxc-start: no ns_cgroup option specified" when
        # trying to start containers within a container.
        # The bug seems to appear when the cgroup hierarchies are not
        # mounted on the exact same directories in the host, and in the
        # container.

        # Named, control-less cgroups are mounted with "-o name=foo"
        # (and appear as such under /proc/<pid>/cgroup) but are usually
        # mounted on a directory named "foo" (without the "name=" prefix).
        # Systemd and OpenRC (and possibly others) both create such a
        # cgroup. To avoid the aforementioned bug, we symlink "foo" to
        # "name=foo". This shouldn't have any adverse effect.
        # But this also tends to produce permission errors, so we drop
        # error output.
        case "$SUBSYS" in
        name=*)
            NAME=${SUBSYS#name=}
            ln -s "$SUBSYS" "$CGROUP/$NAME" 2>/dev/null
            ;;
        esac

        # Likewise, on at least one system, it has been reported that
        # systemd would mount the CPU and CPU accounting controllers
        # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
        # but on a directory called "cpu,cpuacct" (note the inversion
        # in the order of the groups). This tries to work around it.
        if [ "$SUBSYS" = cpuacct,cpu ]
        then
            ln -s "$SUBSYS" "$CGROUP/cpu,cpuacct" 2>/dev/null
        fi
    done

    # Note: at the time this check was written, the LXC userland tools could
    # not set up a "sub-container" properly unless the "devices" cgroup was in
    # its own hierarchy. Detect that situation and warn about it.
    if ! grep -q :devices: /proc/1/cgroup
    then
        echo "WARNING: the 'devices' cgroup should be in its own hierarchy."
    fi
    # Separately, warn when "devices" does not show up as a word at all.
    if ! grep -qw devices /proc/1/cgroup
    then
        echo "WARNING: it looks like the 'devices' cgroup is not mounted."
    fi

    # Now, close extraneous file descriptors.
    # The glob below is expanded by this shell while its working directory is
    # /proc/self/fd, so FD iterates over the names of this shell's own open
    # descriptors.
    # NOTE(review): presumably this keeps the daemon from inheriting stray
    # descriptors from whatever launched us -- confirm.
    pushd /proc/self/fd >/dev/null
    for FD in *
    do
        case "$FD" in
        # Keep stdin/stdout/stderr
        [012])
            ;;
        # Nuke everything else
        *)
            # eval makes the shell re-parse the expanded text, so the number
            # held in FD becomes part of the redirection (e.g. "7>&-") and is
            # not treated as an argument to exec.
            eval exec "$FD>&-"
            ;;
        esac
    done
    popd >/dev/null


    # If a pidfile is still around (for example after a container restart),
    # delete it so that docker can start.
    rm -rf /var/run/docker.pid
    
    # Only act when the mount exposes cgroup.controllers (a cgroups v2 file)
    # and this shell's PID is listed in the cgroup.procs of the visible root.
    # The existence test runs first so grep is never pointed at a cgroup.procs
    # file that may be absent on other layouts.
    if [ -e /sys/fs/cgroup/cgroup.controllers ] && grep -qx "$$" /sys/fs/cgroup/cgroup.procs; then
        # We need to move to a child cgroup. See
        # https://github.com/moby/moby/blob/ed89041433a031cafc0a0f19cfe573c31688d377/hack/dind#L28-L37
        # TODO: when containers change to get set up in a way that lets
        # Docker just make its own v2 cgroup properly, stop trying to do
        # this. See
        # <https://github.com/docker/for-mac/issues/6288#issuecomment-1250799498>
        echo "Moving everybody to a child cgroup. This may not work if we can't escape our cgroup!"
        set -x

        # The root cgroup we start in, in a container, might not actually
        # have any controllers on yet, but we still need to move.
        ACTIVE_CONTROLLERS="$(cat /sys/fs/cgroup/cgroup.subtree_control)"

        # The set of controllers to switch off and back on. A value already
        # present in the environment is honoured; otherwise fall back to the
        # controllers that are active right now, so that the toggle restores
        # exactly what was enabled before. (Previously this variable was never
        # assigned, which left both toggle strings empty.)
        : "${WANTED_CONTROLLERS:=${ACTIVE_CONTROLLERS}}"

        # Prepend plusses and minuses to every controller name, producing
        # e.g. "-cpu -memory" and "+cpu +memory".
        CONTROLLERS_OFF="$(echo "${WANTED_CONTROLLERS}" | sed 's/\(^\| \)\([^ ]\)/\1-\2/g')"
        CONTROLLERS_ON="$(echo "${WANTED_CONTROLLERS}" | sed 's/\(^\| \)\([^ ]\)/\1+\2/g')"

        # Turn off our subtree's controllers
        echo "${CONTROLLERS_OFF}" > /sys/fs/cgroup/cgroup.subtree_control

        # Make a new cgroup under this one. Hope it isn't used yet.
        mkdir -p /sys/fs/cgroup/init

        # Since the controllers are all off we can have child processes in
        # a child cgroup, so move everybody one at a time and hope nobody
        # is forking. Each process needs to move in its own write() call,
        # which is why xargs runs one echo per PID.
        cat /sys/fs/cgroup/cgroup.procs | xargs -rn 1 echo >/sys/fs/cgroup/init/cgroup.procs
        # Note that we still see the same cgroup hierarchy root even though
        # we should now be in a child cgroup of where we were before.
        # TODO: We will get "echo: write error: No such process" from this
        # sometimes. Why? And can we drop whatever that is from the list in
        # advance?

        # Now turn the controllers back on again
        echo "${CONTROLLERS_ON}" > /sys/fs/cgroup/cgroup.subtree_control

        set +x
    fi

    # With PORT set, replace this shell with a foreground dockerd listening on
    # both the TCP port and the Unix socket. Without it, launch dockerd in the
    # background, sending its output to /var/log/docker.log when LOG is "file".
    # DOCKER_DAEMON_ARGS is left unquoted on purpose so that it splits into
    # separate daemon arguments.
    if [ -n "$PORT" ]
    then
        exec dockerd -H 0.0.0.0:$PORT -H unix:///var/run/docker.sock \
            $DOCKER_DAEMON_ARGS
    elif [ "$LOG" = "file" ]
    then
        (dockerd $DOCKER_DAEMON_ARGS &>/var/log/docker.log &)
    else
        (dockerd $DOCKER_DAEMON_ARGS &)
    fi
fi

# We didn't exec, so poll once a second until "docker info" succeeds and the
# daemon's pidfile exists, giving up 60 seconds from now. Giving up only logs a
# message; the script does not exit with an error because of it.
timeout=$(( SECONDS + 60 ))
while true
do
    if docker info >/dev/null 2>&1 && [ -e /var/run/docker.pid ]
    then
        break
    fi
    if (( SECONDS >= timeout ))
    then
        echo 'Timed out trying to connect to internal docker host.' >&2
        break
    fi
    sleep 1
done