diff --git a/internal/runner/nomad_runner.go b/internal/runner/nomad_runner.go index a9989b07..69f28adf 100644 --- a/internal/runner/nomad_runner.go +++ b/internal/runner/nomad_runner.go @@ -288,14 +288,36 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co case exitInfo := <-exitInternal: // - The execution ended in time or // - the HTTP request of the client/CodeOcean got canceled. - exit <- exitInfo - return + r.handleExit(exitInfo, exitInternal, exit, stdin, ctx) case <-ctx.Done(): // - The execution timeout was exceeded, // - the runner was destroyed (runner timeout, or API delete request), or // - the WebSocket connection to the client/CodeOcean closed. + r.handleContextDone(exitInternal, exit, stdin, ctx) + } +} + +func (r *NomadJob) handleExit(exitInfo ExitInfo, exitInternal <-chan ExitInfo, exit chan<- ExitInfo, + stdin io.ReadWriter, ctx context.Context) { + // Special Handling of OOM Killed allocations. See #401. + const exitCodeOfLikelyOOMKilledAllocation = 128 + const gracePeriodForConfirmingOOMKillReason = time.Second + if exitInfo.Code == exitCodeOfLikelyOOMKilledAllocation { + select { + case <-ctx.Done(): + // We consider this allocation to be OOM Killed. + r.handleContextDone(exitInternal, exit, stdin, ctx) + return + case <-time.After(gracePeriodForConfirmingOOMKillReason): + // We consider that the payload code returned the exit code. + } } + exit <- exitInfo +} + +func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- ExitInfo, + stdin io.ReadWriter, ctx context.Context) { err := ctx.Err() if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok { err = reason diff --git a/internal/runner/nomad_runner_test.go b/internal/runner/nomad_runner_test.go index d8ee6b40..d90c393b 100644 --- a/internal/runner/nomad_runner_test.go +++ b/internal/runner/nomad_runner_test.go @@ -263,6 +263,47 @@ func (s *ExecuteInteractivelyTestSuite) TestDestroyReasonIsPassedToExecution() { s.ErrorIs(exit.Err, ErrOOMKilled) } +func (s *ExecuteInteractivelyTestSuite) TestSuspectedOOMKilledExecutionWaitsForVerification() { + s.mockedExecuteCommandCall.Return(128, nil) + executionRequest := &dto.ExecutionRequest{} + s.Run("Actually OOM Killed", func() { + s.runner.StoreExecution(defaultExecutionID, executionRequest) + exitChannel, _, err := s.runner.ExecuteInteractively( + defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background()) + s.Require().NoError(err) + + select { + case <-exitChannel: + s.FailNow("For exit code 128 Poseidon should wait a while to verify the OOM Kill assumption.") + case <-time.After(tests.ShortTimeout): + // All good. Poseidon waited. + } + + err = s.runner.Destroy(ErrOOMKilled) + s.Require().NoError(err) + exit := <-exitChannel + s.ErrorIs(exit.Err, ErrOOMKilled) + }) + + ctx, cancel := context.WithCancel(context.Background()) + s.runner.ctx = ctx + s.runner.cancel = cancel + s.Run("Not OOM Killed", func() { + s.runner.StoreExecution(defaultExecutionID, executionRequest) + exitChannel, _, err := s.runner.ExecuteInteractively( + defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background()) + s.Require().NoError(err) + + select { + case <-time.After(tests.ShortTimeout + time.Second): + s.FailNow("Poseidon should not wait too long for verifying the OOM Kill assumption.") + case exit := <-exitChannel: + s.Equal(uint8(128), exit.Code) + s.Nil(exit.Err) + } + }) +} + func TestUpdateFileSystemTestSuite(t *testing.T) { suite.Run(t, new(UpdateFileSystemTestSuite)) }