Skip to content

Commit

Permalink
i#7023: Switch drmemtrace replay of idles to use record count (#7036)
Browse files Browse the repository at this point in the history
The drmemtrace replay of idles was using wall-clock time to reproduce
the duration. A new idle replay type is added here (and made the
default) which uses the idle record count instead, leaving support for
the old format under the old type.

Idle during replay now properly returns STATUS_IDLE, rather than
STATUS_WAIT.

Tested the new idle type on the existing record-replay tests inside
scheduler_unit_tests and in core_sharded_test.

Support for the old format is considered best-effort as its use of
wall-clock time makes it not able to reproduce precise timing in any
case. Tested manually by generating an old-style record and ensuring the
new code handled it:

$ clients/bin64/drmemtrace_launcher -indir
../src/clients/drcachesim/tests/drmemtrace.threadsig.x64.tracedir
-core_sharded -cores 4 -replay_file
../build_x64_rel_tests/legacy_record.zip

Fixes #7023
  • Loading branch information
derekbruening authored Oct 10, 2024
1 parent 8e694d8 commit 5f4a4c7
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 49 deletions.
116 changes: 69 additions & 47 deletions clients/drcachesim/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ replay_file_checker_t::check(archive_istream_t *infile)
scheduler_t::schedule_record_t record;
bool prev_was_idle = false;
while (infile->read(reinterpret_cast<char *>(&record), sizeof(record))) {
if (record.type == scheduler_t::schedule_record_t::IDLE) {
if (record.type == scheduler_t::schedule_record_t::IDLE ||
record.type == scheduler_t::schedule_record_t::IDLE_BY_COUNT) {
if (prev_was_idle)
return "Error: consecutive idle records";
prev_was_idle = true;
Expand Down Expand Up @@ -1305,10 +1306,17 @@ scheduler_tmpl_t<RecordType, ReaderType>::read_recorded_schedule()
VPRINT(this, 1, "output %d empty: returning eof up front\n", i);
set_cur_input(i, INVALID_INPUT_ORDINAL);
outputs_[i].at_eof = true;
} else if (outputs_[i].record[0].type == schedule_record_t::IDLE) {
} else if (outputs_[i].record[0].type == schedule_record_t::IDLE ||
outputs_[i].record[0].type == schedule_record_t::IDLE_BY_COUNT) {
set_cur_input(i, INVALID_INPUT_ORDINAL);
outputs_[i].waiting = true;
outputs_[i].wait_start_time = 0; // Updated on first next_record().
if (outputs_[i].record[0].type == schedule_record_t::IDLE) {
// Convert a legacy idle duration from microseconds to record counts.
outputs_[i].record[0].value.idle_duration =
static_cast<uint64_t>(options_.time_units_per_us *
outputs_[i].record[0].value.idle_duration);
}
outputs_[i].idle_start_count = -1; // Updated on first next_record().
VPRINT(this, 3, "output %d starting out idle\n", i);
} else {
assert(outputs_[i].record[0].type == schedule_record_t::DEFAULT);
Expand Down Expand Up @@ -2531,17 +2539,25 @@ scheduler_tmpl_t<RecordType, ReaderType>::record_schedule_segment(
input_ordinal_t input, uint64_t start_instruction, uint64_t stop_instruction)
{
assert(type == schedule_record_t::VERSION || type == schedule_record_t::FOOTER ||
type == schedule_record_t::IDLE || inputs_[input].lock->owned_by_cur_thread());
// ::IDLE is a legacy type we should not see in new recordings.
type == schedule_record_t::IDLE_BY_COUNT ||
inputs_[input].lock->owned_by_cur_thread());
// We always use the current wall-clock time, as the time stored in the prior
// next_record() call can be out of order across outputs and lead to deadlocks.
uint64_t timestamp = get_time_micros();
if (type == schedule_record_t::IDLE &&
outputs_[output].record.back().type == schedule_record_t::IDLE) {
if (type == schedule_record_t::IDLE_BY_COUNT &&
outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) {
// Merge. We don't need intermediate timestamps when idle, and consecutive
// idle records quickly balloon the file.
return sched_type_t::STATUS_OK;
}
VPRINT(this, 4,
if (type == schedule_record_t::IDLE_BY_COUNT) {
// Start prior to this idle.
outputs_[output].idle_start_count = outputs_[output].idle_count - 1;
// That is what we'll record in the value union shared w/ start_instruction.
assert(start_instruction == outputs_[output].idle_count - 1);
}
VPRINT(this, 3,
"recording out=%d type=%d input=%d start=%" PRIu64 " stop=%" PRIu64
" time=%" PRIu64 "\n",
output, type, input, start_instruction, stop_instruction, timestamp);
Expand All @@ -2560,24 +2576,25 @@ scheduler_tmpl_t<RecordType, ReaderType>::close_schedule_segment(output_ordinal_
assert(!outputs_[output].record.empty());
assert(outputs_[output].record.back().type == schedule_record_t::VERSION ||
outputs_[output].record.back().type == schedule_record_t::FOOTER ||
outputs_[output].record.back().type == schedule_record_t::IDLE ||
// ::IDLE is for legacy recordings, not new ones.
outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT ||
input.lock->owned_by_cur_thread());
if (outputs_[output].record.back().type == schedule_record_t::SKIP) {
// Skips already have a final stop value.
return sched_type_t::STATUS_OK;
}
if (outputs_[output].record.back().type == schedule_record_t::IDLE) {
// Just like in record_schedule_segment() we use wall-clock time for recording
// replay timestamps.
uint64_t end = get_time_micros();
assert(end >= outputs_[output].record.back().timestamp);
if (outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) {
uint64_t end_idle_count = outputs_[output].idle_count;
assert(outputs_[output].idle_start_count >= 0);
assert(end_idle_count >=
static_cast<uint64_t>(outputs_[output].idle_start_count));
outputs_[output].record.back().value.idle_duration =
end - outputs_[output].record.back().timestamp;
end_idle_count - outputs_[output].idle_start_count;
VPRINT(this, 3,
"close_schedule_segment: idle duration %" PRIu64 " = %" PRIu64
"close_schedule_segment[%d]: idle duration %" PRIu64 " = %" PRIu64
" - %" PRIu64 "\n",
outputs_[output].record.back().value.idle_duration, end,
outputs_[output].record.back().timestamp);
output, outputs_[output].record.back().value.idle_duration, end_idle_count,
outputs_[output].idle_start_count);
return sched_type_t::STATUS_OK;
}
uint64_t instr_ord = get_instr_ordinal(input);
Expand All @@ -2596,9 +2613,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::close_schedule_segment(output_ordinal_
++instr_ord;
}
VPRINT(this, 3,
"close_schedule_segment: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64
"close_schedule_segment[%d]: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64
"\n",
input.index, outputs_[output].record.back().type,
output, input.index, outputs_[output].record.back().type,
outputs_[output].record.back().value.start_instruction, instr_ord);
// Check for empty default entries, except the starter 0,0 ones.
assert(outputs_[output].record.back().type != schedule_record_t::DEFAULT ||
Expand Down Expand Up @@ -2927,7 +2944,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
add_to_ready_queue(output, &inputs_[prev_input]);
}
} else if (options_.schedule_record_ostream != nullptr &&
outputs_[output].record.back().type == schedule_record_t::IDLE) {
(outputs_[output].record.back().type == schedule_record_t::IDLE ||
outputs_[output].record.back().type ==
schedule_record_t::IDLE_BY_COUNT)) {
input_info_t unused;
sched_type_t::stream_status_t status = close_schedule_segment(output, unused);
if (status != sched_type_t::STATUS_OK)
Expand Down Expand Up @@ -3038,19 +3057,27 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
}
return eof_or_idle(output, outputs_[output].cur_input);
}
const schedule_record_t &segment = outputs_[output].record[record_index + 1];
if (segment.type == schedule_record_t::IDLE) {
schedule_record_t &segment = outputs_[output].record[record_index + 1];
if (segment.type == schedule_record_t::IDLE ||
segment.type == schedule_record_t::IDLE_BY_COUNT) {
outputs_[output].waiting = true;
outputs_[output].wait_start_time = get_output_time(output);
if (segment.type == schedule_record_t::IDLE) {
// Convert a legacy idle duration from microseconds to record counts.
segment.value.idle_duration = static_cast<uint64_t>(
options_.time_units_per_us * segment.value.idle_duration);
}
outputs_[output].idle_start_count = outputs_[output].idle_count;
outputs_[output].record_index->fetch_add(1, std::memory_order_release);
++outputs_[output].idle_count;
VPRINT(this, 5, "%s[%d]: next replay segment idle for %" PRIu64 "\n",
__FUNCTION__, output, segment.value.idle_duration);
return sched_type_t::STATUS_IDLE;
}
index = segment.key.input;
VPRINT(this, 5,
"pick_next_input_as_previously[%d]: next replay segment in=%d (@%" PRId64
") type=%d start=%" PRId64 " end=%" PRId64 "\n",
output, index, get_instr_ordinal(inputs_[index]), segment.type,
"%s[%d]: next replay segment in=%d (@%" PRId64 ") type=%d start=%" PRId64
" end=%" PRId64 "\n",
__FUNCTION__, output, index, get_instr_ordinal(inputs_[index]), segment.type,
segment.value.start_instruction, segment.stop_instruction);
{
std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
Expand Down Expand Up @@ -3404,7 +3431,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
if (options_.schedule_record_ostream != nullptr) {
sched_type_t::stream_status_t record_status =
record_schedule_segment(
output, schedule_record_t::IDLE, 0, 0, 0);
output, schedule_record_t::IDLE_BY_COUNT, 0,
// Start prior to this idle.
outputs_[output].idle_count - 1, 0);
if (record_status != sched_type_t::STATUS_OK)
return record_status;
}
Expand Down Expand Up @@ -3734,18 +3763,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
// do return an error on a time smaller than an input's current start time when we
// check for quantum end.
if (cur_time == 0) {
if (options_.mapping == MAP_AS_PREVIOUSLY) {
// XXX i#7023: We should instead store the simulator's time (whether
// passed in or our instr-based formula below) in the records and do away
// with wall-clock time for idle measurement. Either way, we should make
// it clear in the docs whether the user/simulator has to pass in the
// time on replay.
cur_time = get_time_micros();
} else {
// We add 1 to avoid an invalid value of 0.
cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() +
outputs_[output].idle_count;
}
// We add 1 to avoid an invalid value of 0.
cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() +
outputs_[output].idle_count;
}
// Invalid values for cur_time are checked below.
outputs_[output].cur_time->store(cur_time, std::memory_order_release);
Expand All @@ -3755,22 +3775,24 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
}
if (outputs_[output].waiting) {
if (options_.mapping == MAP_AS_PREVIOUSLY &&
outputs_[output].wait_start_time > 0) {
outputs_[output].idle_start_count >= 0) {
uint64_t duration = outputs_[output]
.record[outputs_[output].record_index->load(
std::memory_order_acquire)]
.value.idle_duration;
uint64_t now = get_output_time(output);
if (now - outputs_[output].wait_start_time < duration) {
uint64_t now = outputs_[output].idle_count;
if (now - outputs_[output].idle_start_count < duration) {
VPRINT(this, 4,
"next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n",
output, now - outputs_[output].wait_start_time, duration);
// XXX i#7023: This should always be STATUS_IDLE, right?
return sched_type_t::STATUS_WAIT;
output, now - outputs_[output].idle_start_count, duration);
++outputs_[output].idle_count;
return sched_type_t::STATUS_IDLE;
} else
outputs_[output].wait_start_time = 0;
outputs_[output].idle_start_count = -1;
}
VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output);
VPRINT(this, 5,
"next_record[%d]: need new input (cur=waiting; idles=%" PRIu64 ")\n",
output, outputs_[output].idle_count);
sched_type_t::stream_status_t res = pick_next_input(output, 0);
if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_SKIPPED)
return res;
Expand Down
8 changes: 6 additions & 2 deletions clients/drcachesim/scheduler/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -1459,6 +1459,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
// Indicates that the output is idle. The value.idle_duration field holds
// a duration in microseconds.
IDLE,
// Indicates that the output is idle. The value.idle_duration field holds
// a duration as a count of idle records.
IDLE_BY_COUNT,
};
static constexpr int VERSION_CURRENT = 0;
schedule_record_t() = default;
Expand Down Expand Up @@ -1493,6 +1496,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
{
}
// For record_type_t::IDLE, the duration in microseconds of the idling.
// For record_type_t::IDLE_BY_COUNT, the duration as a count of idle records.
uint64_t idle_duration;
// Input stream ordinal of starting point, for non-IDLE types.
uint64_t start_instruction = 0;
Expand Down Expand Up @@ -1692,8 +1696,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
int64_t as_traced_cpuid = -1;
// Used for MAP_AS_PREVIOUSLY with live_replay_output_count_.
bool at_eof = false;
// Used for replaying wait periods.
uint64_t wait_start_time = 0;
// Used for recording and replaying idle periods.
int64_t idle_start_count = -1;
// Exported statistics. Currently all integers and cast to double on export.
std::vector<int64_t> stats =
std::vector<int64_t>(memtrace_stream_t::SCHED_STAT_TYPE_COUNT);
Expand Down

0 comments on commit 5f4a4c7

Please sign in to comment.