Skip to content

Commit

Permalink
Fix OOM Kill race condition
Browse files Browse the repository at this point in the history
due to the Nomad request exiting before the allocation is stopped. We catch this behavior by introducing a time period for the allocation being stopped iff the exit code is 128.
  • Loading branch information
mpass99 committed Jul 21, 2023
1 parent 5a6b040 commit a41b62e
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 2 deletions.
26 changes: 24 additions & 2 deletions internal/runner/nomad_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,36 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
case exitInfo := <-exitInternal:
// - The execution ended in time or
// - the HTTP request of the client/CodeOcean got canceled.
exit <- exitInfo
return
r.handleExit(exitInfo, exitInternal, exit, stdin, ctx)
case <-ctx.Done():
// - The execution timeout was exceeded,
// - the runner was destroyed (runner timeout, or API delete request), or
// - the WebSocket connection to the client/CodeOcean closed.
r.handleContextDone(exitInternal, exit, stdin, ctx)
}
}

func (r *NomadJob) handleExit(exitInfo ExitInfo, exitInternal <-chan ExitInfo, exit chan<- ExitInfo,
stdin io.ReadWriter, ctx context.Context) {
// Special Handling of OOM Killed allocations. See #401.
const exitCodeOfLikelyOOMKilledAllocation = 128
const gracePeriodForConfirmingOOMKillReason = time.Second
if exitInfo.Code == exitCodeOfLikelyOOMKilledAllocation {
select {
case <-ctx.Done():
// We consider this allocation to be OOM Killed.
r.handleContextDone(exitInternal, exit, stdin, ctx)
return
case <-time.After(gracePeriodForConfirmingOOMKillReason):
// We consider that the payload code returned the exit code.
}
}

exit <- exitInfo
}

func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- ExitInfo,
stdin io.ReadWriter, ctx context.Context) {
err := ctx.Err()
if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok {
err = reason
Expand Down
41 changes: 41 additions & 0 deletions internal/runner/nomad_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,47 @@ func (s *ExecuteInteractivelyTestSuite) TestDestroyReasonIsPassedToExecution() {
s.ErrorIs(exit.Err, ErrOOMKilled)
}

func (s *ExecuteInteractivelyTestSuite) TestSuspectedOOMKilledExecutionWaitsForVerification() {
s.mockedExecuteCommandCall.Return(128, nil)
executionRequest := &dto.ExecutionRequest{}
s.Run("Actually OOM Killed", func() {
s.runner.StoreExecution(defaultExecutionID, executionRequest)
exitChannel, _, err := s.runner.ExecuteInteractively(
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
s.Require().NoError(err)

select {
case <-exitChannel:
s.FailNow("For exit code 128 Poseidon should wait a while to verify the OOM Kill assumption.")
case <-time.After(tests.ShortTimeout):
// All good. Poseidon waited.
}

err = s.runner.Destroy(ErrOOMKilled)
s.Require().NoError(err)
exit := <-exitChannel
s.ErrorIs(exit.Err, ErrOOMKilled)
})

ctx, cancel := context.WithCancel(context.Background())
s.runner.ctx = ctx
s.runner.cancel = cancel
s.Run("Not OOM Killed", func() {
s.runner.StoreExecution(defaultExecutionID, executionRequest)
exitChannel, _, err := s.runner.ExecuteInteractively(
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
s.Require().NoError(err)

select {
case <-time.After(tests.ShortTimeout + time.Second):
s.FailNow("Poseidon should not wait too long for verifying the OOM Kill assumption.")
case exit := <-exitChannel:
s.Equal(uint8(128), exit.Code)
s.Nil(exit.Err)
}
})
}

func TestUpdateFileSystemTestSuite(t *testing.T) {
suite.Run(t, new(UpdateFileSystemTestSuite))
}
Expand Down

0 comments on commit a41b62e

Please sign in to comment.