Skip to content

Commit

Permalink
Refactor ntasks_running and make sure it doesn't go out of bounds.
Browse files Browse the repository at this point in the history
We change it to `ntasks_stopped` and make it track precisely the number of tasks belonging to the session that have `is_stopped` set to true.
  • Loading branch information
rocallahan committed Aug 15, 2023
1 parent 5442ecf commit 4af8558
Show file tree
Hide file tree
Showing 7 changed files with 91 additions and 37 deletions.
10 changes: 6 additions & 4 deletions src/RecordSession.cc
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ static bool handle_ptrace_exit_event(RecordTask* t) {
// If we died because of a coredumping signal, that is a barrier event, and
// every task in the address space needs to pass its PTRACE_EXIT_EVENT before
// they proceed to (potentially hidden) zombie state, so we can't wait for
// that to happen/
// that to happen.
// Similarly we can't wait for this task to exit if there are other
// tasks in its pid namespace that need to exit and this is the last thread
// of pid-1 in that namespace, because the kernel must reap them before
Expand Down Expand Up @@ -924,9 +924,6 @@ void RecordSession::task_continue(const StepState& step_state) {
}
}
t->resume_execution(resume, RESUME_NONBLOCKING, ticks_request);
if (t->is_running()) {
scheduler().started(t);
}
}

/**
Expand Down Expand Up @@ -2515,6 +2512,11 @@ RecordSession::RecordResult RecordSession::record_step() {
return result;
}
RecordTask* t = scheduler().current();
if (!t) {
// No child to schedule. Yield to our caller to give it a chance
// to do something (e.g. terminate the recording).
return result;
}
if (t->waiting_for_reap) {
// Give it another chance to be reaped
t->did_reach_zombie();
Expand Down
15 changes: 15 additions & 0 deletions src/RecordTask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,9 @@ RecordTask::~RecordTask() {
}
did_kill();
}

// If this was stopped, notify the scheduler.
set_stopped(false);
}

void RecordTask::record_exit_event(WriteChildTid write_child_tid) {
Expand Down Expand Up @@ -854,6 +857,18 @@ void RecordTask::did_reach_zombie() {
}
}

void RecordTask::set_stopped(bool stopped) {
if (is_stopped == stopped) {
return;
}
is_stopped = stopped;
if (stopped) {
session().scheduler().stopped_task(this);
} else {
session().scheduler().started_task(this);
}
}

void RecordTask::send_synthetic_SIGCHLD_if_necessary() {
RecordTask* wake_task = nullptr;
bool need_signal = false;
Expand Down
2 changes: 2 additions & 0 deletions src/RecordTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,8 @@ class RecordTask final : public Task {
robust_futex_list_len = len;
}

void set_stopped(bool stopped) override;

private:
/* Retrieve the tid of this task from the tracee and store it */
void update_own_namespace_tid();
Expand Down
66 changes: 44 additions & 22 deletions src/Scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ Scheduler::Scheduler(RecordSession& session)
must_run_task(nullptr),
pretend_num_cores_(1),
in_exec_tgid(0),
ntasks_stopped(0),
always_switch(false),
enable_chaos(false),
enable_poll(false),
Expand Down Expand Up @@ -289,7 +290,6 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
return false;
}


LOGM(debug) << "Task event is " << t->ev();
if (!t->may_be_blocked()) {
LOGM(debug) << " " << t->tid << " isn't blocked";
Expand Down Expand Up @@ -352,10 +352,10 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
// the task is not stopped yet if we pass WNOHANG. To make them
// behave predictably, do a blocking wait.
t->wait();
ntasks_running--;
*by_waitpid = true;
must_run_task = t;
LOGM(debug) << " sched_yield ready with status " << t->status();
LOGM(debug) << " " << syscall_name(t->ev().Syscall().number, t->arch())
<< " ready with status " << t->status();
return true;
} else {
LOGM(debug) << " " << t->tid << " is blocked on " << t->ev()
Expand All @@ -371,7 +371,6 @@ bool Scheduler::is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator,
return false;
}
*by_waitpid = true;
ntasks_running--;
must_run_task = t;
return true;
}
Expand Down Expand Up @@ -526,7 +525,7 @@ void Scheduler::validate_scheduled_task() {
* and `tid` and `status` are valid, or false if the wait was interrupted
* (by timeout or some other signal).
*/
static bool wait_any(pid_t& tid, WaitStatus& status, double timeout) {
static WaitResultCode wait_any(pid_t& tid, WaitStatus& status, double timeout) {
WaitOptions options;
if (timeout > 0) {
options.block_seconds = timeout;
Expand All @@ -536,21 +535,18 @@ static bool wait_any(pid_t& tid, WaitStatus& status, double timeout) {
case WAIT_OK:
tid = result.tid;
status = result.status;
return true;
break;
case WAIT_NO_STATUS:
LOGM(debug) << " wait interrupted";
return false;
break;
case WAIT_NO_CHILD:
// It's possible that the original thread group was detached,
// and the only thing left we were waiting for, in which case we
// get ECHILD here. Just abort this record step, so the caller
// can end the record session.
return false;
LOGM(debug) << " no child to wait for";
break;
default:
FATAL() << "Unknown result code";
return false;
break;
}
return true;
return result.code;
}

/**
Expand Down Expand Up @@ -639,7 +635,24 @@ static RecordTask* find_waited_task(RecordSession& session, pid_t tid, WaitStatu
}

bool Scheduler::may_use_unlimited_ticks() {
return ntasks_running == session.tasks().size() - 1;
return ntasks_stopped == 1;
}

void Scheduler::started_task(RecordTask* t) {
LOGM(debug) << "Starting " << t->tid;
if (may_use_unlimited_ticks()) {
unlimited_ticks_mode = true;
}
--ntasks_stopped;
ASSERT(t, ntasks_stopped >= 0);
}

void Scheduler::stopped_task(RecordTask* t) {
LOGM(debug) << "Stopping " << t->tid;
++ntasks_stopped;
// When a task is created/cloned it temporarily can be stopped
// but not in our task set.
ASSERT(t, ntasks_stopped <= static_cast<int>(session.tasks().size()) + 1);
}

Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
Expand Down Expand Up @@ -674,20 +687,21 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
// tracer. However, this does mean we need to be on the look out for
// other tasks becoming runnable, which we usually check on timeslice
// expiration.
ASSERT(current_, ntasks_running == session.tasks().size());
ASSERT(current_, !ntasks_stopped);
pid_t tid;
WaitStatus status;
if (!wait_any(tid, status, -1)) {
WaitResultCode wait_result = wait_any(tid, status, -1);
if (wait_result == WAIT_NO_STATUS) {
ASSERT(current_, !must_run_task);
result.interrupted_by_signal = true;
return result;
}
ASSERT(current_, wait_result == WAIT_OK);
RecordTask *waited = find_waited_task(session, tid, status);
if (!waited) {
continue;
}
waited->did_waitpid(status);
ntasks_running--;
// Another task just became runnable, we're no longer in unlimited
// ticks mode
unlimited_ticks_mode = false;
Expand All @@ -706,7 +720,6 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
LOGM(debug) << " But that's not our current task...";
} else {
current_->wait(timeout);
ntasks_running--;
break;
}
}
Expand Down Expand Up @@ -859,11 +872,21 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
do {
double timeout = enable_poll ? 1 : 0;
pid_t tid;
if (!wait_any(tid, status, timeout)) {
ASSERT(current_, !must_run_task);
WaitResultCode wait_result = wait_any(tid, status, timeout);
if (wait_result == WAIT_NO_STATUS) {
if (must_run_task) {
FATAL() << "must_run_task but no status?";
}
result.interrupted_by_signal = true;
return result;
}
if (wait_result == WAIT_NO_CHILD) {
// It's possible that the original thread group was detached,
// and the only thing left we were waiting for, in which case we
// get ECHILD here. Just abort this record step, so the caller
// can end the record session.
return result;
}
LOGM(debug) << " " << tid << " changed status to " << status;
next = find_waited_task(session, tid, status);
now = -1; // invalid, don't use
Expand All @@ -873,7 +896,6 @@ Scheduler::Rescheduled Scheduler::reschedule(Switchable switchable) {
status.ptrace_event() == PTRACE_EVENT_EXIT ||
status.reaped())
<< "Scheduled task should have been blocked";
ntasks_running--;
next->did_waitpid(status);
if (in_exec_tgid && next->tgid() != in_exec_tgid) {
// Some threadgroup is doing execve and this task isn't in
Expand Down
22 changes: 14 additions & 8 deletions src/Scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,16 @@ class Scheduler {
bool may_use_unlimited_ticks();

/**
* Let the scheduler know that the passed task has started running
* Let the scheduler know that the previously stopped task has resumed.
*/
void started(RecordTask*) {
if (may_use_unlimited_ticks()) {
unlimited_ticks_mode = true;
}
ntasks_running++;
}
void started_task(RecordTask* t);

/**
* Let the scheduler know that the previously running task has reached a kernel stop
* (typically a ptrace stop). Tasks that are blocked but not in a stop
* are still "running" for our purposes here.
*/
void stopped_task(RecordTask* t);

/**
* Let the scheduler know that the task has entered an execve.
Expand Down Expand Up @@ -284,6 +286,11 @@ class Scheduler {
*/
pid_t in_exec_tgid;

/**
* The number of tasks that have is_stopped set.
*/
int ntasks_stopped;

/**
* When true, context switch at every possible point.
*/
Expand All @@ -298,7 +305,6 @@ class Scheduler {
bool last_reschedule_in_high_priority_only_interval;

bool unlimited_ticks_mode;
size_t ntasks_running;
};

} // namespace rr
Expand Down
6 changes: 3 additions & 3 deletions src/Task.cc
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ void Task::wait_exit() {
// case we're only now catching up to the real process exit. In that
// case, just ask the process to actually exit. (TODO: We may want to
// catch this earlier).
return proceed_to_exit(true);
return proceed_to_exit();
}
ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXEC)
<< "Expected PTRACE_EVENT_EXEC, got " << result.status;
Expand Down Expand Up @@ -1525,7 +1525,7 @@ void Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
} else {
ASSERT(this, setup_succeeded);
ptrace_if_alive(how, nullptr, (void*)(uintptr_t)sig);
is_stopped = false;
set_stopped(false);
extra_registers_known = false;
if (RESUME_WAIT == wait_how) {
wait();
Expand Down Expand Up @@ -2108,7 +2108,7 @@ void Task::did_waitpid(WaitStatus status) {
// Mark as stopped now. If we fail one of the ticks assertions below,
// the test-monitor (or user) might want to attach the emergency debugger,
// which needs to know that the tracee is stopped.
is_stopped = true;
set_stopped(true);

if (status.reaped()) {
was_reaped = true;
Expand Down
7 changes: 7 additions & 0 deletions src/Task.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ class Task {

/**
* Advance the task to its exit state if it's not already there.
* If `wait` is false, then during recording Scheduler::start() must be
* called.
*/
void proceed_to_exit(bool wait = true);

Expand Down Expand Up @@ -645,6 +647,11 @@ class Task {
*/
bool is_running() const { return !is_stopped; }

/**
* Setter for `is_stopped` to update `Scheduler::ntasks_stopped`.
*/
virtual void set_stopped(bool stopped) { is_stopped = stopped; }

/**
* Return the status of this as of the last successful wait()/try_wait() call.
*/
Expand Down

0 comments on commit 4af8558

Please sign in to comment.