Skip to content

Commit

Permalink
Refactor Runner Destroy Reason Masking
Browse files Browse the repository at this point in the history
and ignore expected reasons such when the runner got destroyed by an API request.
  • Loading branch information
mpass99 committed Jul 24, 2023
1 parent 102b3f0 commit eb818f9
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 15 deletions.
12 changes: 9 additions & 3 deletions internal/api/ws/codeocean_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"github.com/gorilla/websocket"
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/internal/runner"
"github.com/openHPI/poseidon/pkg/dto"
"io"
Expand Down Expand Up @@ -81,17 +82,22 @@ func (cw *codeOceanOutputWriter) StdErr() io.Writer {
// Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
// This results in the closing of the WebSocket connection.
func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
// Mask the internal stop reason before disclosing/forwarding it externally/to CodeOcean.
switch {
case info.Err == nil:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
case errors.Is(info.Err, runner.ErrOOMKilled):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()})
case info.Err != nil:
case errors.Is(info.Err, nomad.ErrorAllocationCompleted), errors.Is(info.Err, runner.ErrDestroyedByAPIRequest):
message := "the allocation stopped as expected"
log.WithContext(cw.ctx).WithError(info.Err).Debug(message)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: message})
default:
errorMessage := "Error executing the request"
log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage})
default:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
}
}

Expand Down
9 changes: 0 additions & 9 deletions internal/runner/nomad_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,15 +189,6 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)

r, stillActive := m.usedRunners.Get(runnerID)
if stillActive {
// Mask the internal stop reason because the runner might disclose/forward it to CodeOcean/externally.
switch {
case errors.Is(reason, nomad.ErrorOOMKilled):
reason = ErrOOMKilled
default:
log.WithField(dto.KeyRunnerID, runnerID).WithField("reason", reason).Debug("Internal reason for allocation stop")
reason = ErrAllocationStopped
}

m.usedRunners.Delete(runnerID)
if err := r.Destroy(reason); err != nil {
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
Expand Down
5 changes: 2 additions & 3 deletions internal/runner/nomad_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ var (
ErrorUnknownExecution = errors.New("unknown execution")
ErrorFileCopyFailed = errors.New("file copy failed")
ErrFileNotFound = errors.New("file not found or insufficient permissions")
ErrAllocationStopped DestroyReason = errors.New("the allocation stopped")
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
Expand Down Expand Up @@ -244,7 +243,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
err = r.onDestroy(r)
}

if err == nil && (!errors.Is(reason, ErrAllocationStopped) || !errors.Is(reason, ErrOOMKilled)) {
if err == nil && !errors.Is(reason, ErrOOMKilled) {
err = util.RetryExponential(time.Second, func() (err error) {
if err = r.api.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
Expand Down Expand Up @@ -331,7 +330,7 @@ func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- E
exit <- ExitInfo{255, err}

// This condition prevents further interaction with a stopped / dead allocation.
if errors.Is(err, ErrAllocationStopped) || errors.Is(err, ErrOOMKilled) {
if errors.Is(err, nomad.ErrorOOMKilled) {
return
}

Expand Down

0 comments on commit eb818f9

Please sign in to comment.