Skip to content

Commit

Permalink
v2: reapChildren should use the pid and not the pgid when pgid==0 (#410)
Browse files Browse the repository at this point in the history
* kill whole process group when killing on timeout

* send reapChildren the pid rather the pgid when the pgid is 0.

Whenever our pgid is 0, the reapChildren function won't have anything
to reap so we end up with zombies. So in that case we'll pass the pid;
we need to move this check to after the start of the process so that we
have a valid pid.

* ensure zombies reaping test fails correctly

* revert pgid fix for RunAndWaitForOutput

we don't ever get access to the pid in this method because the golang
stdlib is blocking here; by the time the Output call returns the pid
is gone and we get a NPE.
  • Loading branch information
tgross authored Jun 23, 2017
1 parent c45436a commit ec15171
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 33 deletions.
36 changes: 27 additions & 9 deletions commands/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,17 @@ func RunAndWait(c *Command) (int, error) {
fmt.Sprintf("CONTAINERPILOT_%s_PID", strings.ToUpper(c.Name)),
fmt.Sprintf("%v", c.Cmd.Process.Pid),
)
defer reapChildren(c.Cmd.SysProcAttr.Pgid)
// See https://golang.org/src/syscall/exec_linux.go
// SysProcAttr.Setpgid:
// "Set process group ID to Pgid, or, if Pgid == 0, to new pid"
// So in the case where ContainerPilot is PID1 this will fail
// to reap zombies unless we pass the PID and not the PGID to
// the syscall.Wait4 in reapChildren
pgid := c.Cmd.SysProcAttr.Pgid
if pgid == 0 {
pgid = c.Cmd.Process.Pid
}
defer reapChildren(pgid)
err := c.Cmd.Wait()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down Expand Up @@ -125,6 +135,7 @@ func RunAndWaitForOutput(c *Command) (string, error) {
// to write to our collector.
c.Cmd.Stderr = os.Stderr
log.Debugf("%s.Cmd.Output", c.Name)

defer reapChildren(c.Cmd.SysProcAttr.Pgid)
out, err := c.Cmd.Output()
if err != nil {
Expand Down Expand Up @@ -168,13 +179,12 @@ func (c *Command) setUpCmd() {
}

// Kill sends a kill signal to the underlying process.
func (c *Command) Kill() error {
func (c *Command) Kill() {
log.Debugf("%s.kill", c.Name)
if c.Cmd != nil && c.Cmd.Process != nil {
log.Warnf("killing command for %s", c.Name)
return c.Cmd.Process.Kill()
syscall.Kill(-c.Cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}

func (c *Command) waitForTimeout() error {
Expand All @@ -193,10 +203,7 @@ func (c *Command) waitForTimeout() error {
select {
case <-ticker.C:
log.Warnf("%s timeout after %s: '%s'", c.Name, c.Timeout, c.Args)
if err := c.Kill(); err != nil {
log.Errorf("error killing command: %v", err)
return
}
c.Kill()
log.Debugf("%s.run#gofunc swallow quit", c.Name)
// Swallow quit signal
<-quit
Expand All @@ -213,7 +220,18 @@ func (c *Command) waitForTimeout() error {
}
log.Debugf("%s.run waiting", c.Name)

defer reapChildren(c.Cmd.SysProcAttr.Pgid)
// See https://golang.org/src/syscall/exec_linux.go
// SysProcAttr.Setpgid:
// "Set process group ID to Pgid, or, if Pgid == 0, to new pid"
// So in the case where ContainerPilot is PID1 this will fail
// to reap zombies unless we pass the PID and not the PGID to
// the syscall.Wait4 in reapChildren
pgid := c.Cmd.SysProcAttr.Pgid
if pgid == 0 {
pgid = c.Cmd.Process.Pid
}
defer reapChildren(pgid)

err := c.Cmd.Wait()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
Expand Down
11 changes: 3 additions & 8 deletions commands/commands_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,14 @@ func TestRunWithTimeout(t *testing.T) {

// Ensure the task has time to start
runtime.Gosched()
// Wait for task to start + 250ms
// Wait for task to start + 450ms
ticker := time.NewTicker(650 * time.Millisecond)
select {
case <-ticker.C:
ticker.Stop()
err := cmd.Kill() // make sure we don't keep running
if err == nil {
// firing Kill should throw an error
if cmd.Cmd.ProcessState.Success() {
cmd.Kill() // make sure we don't keep running even if we failed
t.Fatalf("Command was not stopped by timeout")
} else {
if err.Error() != "os: process already finished" {
t.Fatalf("Command.Kill got unexpected error: %s", err)
}
}
}
}
Expand Down
29 changes: 13 additions & 16 deletions integration_tests/tests/test_reap_zombies/run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
set -e
# Test to verify that we're correctly reaping zombies.
# At any given time we may have up to 1 zombie parented to PID1 (it has been
# reparented but not yet reaped) and 1 zombie not yet parented to PID1.
Expand All @@ -14,23 +15,19 @@ if [ ! $? -eq 0 ] ; then exit 1 ; fi
APP_ID="$(docker-compose ps -q app)"
sleep 6

PTREE=$(docker exec $APP_ID ps -o stat,ppid,pid,comm)
REPARENTED_ZOMBIES=$(echo $PTREE | awk '
BEGIN { count=0 }
$1 ~ /^Z/ && $2 ~ /^1$/ { count++ }
END { print count }
')
PTREE=$(docker exec "$APP_ID" ps -o stat,ppid,pid,comm)

TOTAL_ZOMBIES=$(echo $PTREE | awk '
BEGIN { count=0 }
$1 ~ /^Z/ { count++ }
END { print count }
')
set +e
REPARENTED_ZOMBIES=$(echo "$PTREE" | awk -F' +' '/^Z/{print $2}' | grep -c "^1$")
TOTAL_ZOMBIES=$(echo "$PTREE" | grep -c "^Z")
set -e

if [ $REPARENTED_ZOMBIES -gt 1 ] || [ $TOTAL_ZOMBIES -gt 2 ]; then
echo "More than permitted number of zombies." >&2
echo $PTREE >&2
docker logs $APP_ID
exit 1
if [ "$REPARENTED_ZOMBIES" -gt 1 ] || [ "$TOTAL_ZOMBIES" -gt 2 ]; then
echo "More than permitted number of zombies." >&2
echo "- got $REPARENTED_ZOMBIES reparented zombies" >&2
echo "- got $TOTAL_ZOMBIES total zombies" >&2
echo "$PTREE" >&2
docker logs "${APP_ID}" > app.log
exit 1
fi
exit 0

0 comments on commit ec15171

Please sign in to comment.