Skip to content

Commit

Permalink
sig profile hanging workers before SIGKILL
Browse files Browse the repository at this point in the history
  • Loading branch information
IanButterworth committed Jan 25, 2025
1 parent e1ac9f7 commit 190a6b6
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 4 deletions.
15 changes: 12 additions & 3 deletions src/managers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ addprocs([
* `exeflags`: additional flags passed to the worker processes. It can either be a `Cmd`, a `String`
holding one flag, or a collection of strings, with one element per flag.
E.g. `\`--threads=auto project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
E.g. `\`--threads=auto --project=.\``, `"--compile-trace=stderr"` or `["--threads=auto", "--compile=all"]`.
* `topology`: Specifies how the workers connect to each other. Sending a message between
unconnected workers results in an error.
Expand Down Expand Up @@ -740,16 +740,25 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
nothing
end

function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
function kill(manager::LocalManager, pid::Int, config::WorkerConfig; profile_wait = 6, exit_timeout = 15, term_timeout = 15)
# profile_wait = 6 allows 1s for the profile to be taken plus 5s for the report to be shown
# First, try sending `exit()` to the remote over the usual control channels
remote_do(exit, pid)

profile_sig = Sys.iswindows() ? nothing : Sys.isbsd() ? ("SIGINFO", 29) : ("SIGUSR1" , 10)

timer_task = @async begin
sleep(exit_timeout)

# Check to see if our child exited, and if not, send an actual kill signal
if !process_exited(config.process)
@warn("Failed to gracefully kill worker $(pid), sending SIGQUIT")
@warn "Failed to gracefully kill worker $(pid)"
if profile_sig !== nothing
@info("Sending profile $(profile_sig[1]) to worker $(pid)")
kill(config.process, profile_sig[2])
sleep(profile_wait)
end
@warn("Sending SIGQUIT to worker $(pid)")
kill(config.process, Base.SIGQUIT)

sleep(term_timeout)
Expand Down
2 changes: 1 addition & 1 deletion test/distributed_exec.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1936,7 +1936,7 @@ begin

# Next, ensure we get a log message when a worker does not cleanly exit
w = only(addprocs(1))
@test_logs (:warn, r"sending SIGQUIT") begin
@test_logs (:warn, r"Sending SIGQUIT") match_mode=:any begin
remote_do(w) do
# Cause the 'exit()' message that `rmprocs()` sends to do nothing
Core.eval(Base, :(exit() = nothing))
Expand Down

0 comments on commit 190a6b6

Please sign in to comment.