From e099287197197d1e5ad2502c56aef1fb54b4e911 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 19 Jul 2024 18:44:21 -0400 Subject: [PATCH] i#6882: Set drmemtrace scheduler init state to post-skip (#6894) For inputs with regions of interest that skip over their initial records, the drmemtrace scheduler was using the initial records for the initial timestamp and start-unscheduled state, which is incorrect. Solves this by applying an initial region skip at init time instead of waiting for an input to be scheduled. This requires shifting around when record file output is recorded. Adds a unit test that fails without this fix as it leaves one input permanently unscheduled and has the two inputs in the wrong order without using the skip timestamp. Fixes #6882 --- clients/drcachesim/scheduler/scheduler.cpp | 151 ++++++++++++------ clients/drcachesim/scheduler/scheduler.h | 22 ++- .../drcachesim/tests/scheduler_unit_tests.cpp | 136 +++++++++++++++- 3 files changed, 254 insertions(+), 55 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 2b133f8eab6..37680dc06c0 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -1676,7 +1676,7 @@ scheduler_tmpl_t::process_next_initial_record( return false; } } else if (marker_type == TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE) { - if (options_.honor_direct_switches) { + if (options_.honor_direct_switches && options_.mapping != MAP_AS_PREVIOUSLY) { input.unscheduled = true; // Ignore this marker during regular processing. input.skip_next_unscheduled = true; @@ -1701,6 +1701,29 @@ scheduler_tmpl_t::get_initial_input_content( // output stream(s). for (size_t i = 0; i < inputs_.size(); ++i) { input_info_t &input = inputs_[i]; + + // If the input jumps to the middle immediately, do that now so we'll have + // the proper start timestamp. + if (!input.regions_of_interest.empty() && + // The docs say for replay we allow the user to pass ROI but ignore it. + // Maybe we should disallow it so we don't need checks like this? + options_.mapping != MAP_AS_PREVIOUSLY) { + RecordType record = create_invalid_record(); + sched_type_t::stream_status_t res = + advance_region_of_interest(/*output=*/-1, record, input); + if (res == sched_type_t::STATUS_SKIPPED) { + input.next_timestamp = + static_cast(input.reader->get_last_timestamp()); + // We can skip the rest of the loop here (the filetype will be there + // in the stream). + continue; + } + if (res != sched_type_t::STATUS_OK) { + VPRINT(this, 1, "Failed to advance initial ROI with status %d\n", res); + return sched_type_t::STATUS_ERROR_RANGE_INVALID; + } + } + bool found_filetype = false; bool found_timestamp = !gather_timestamps || input.next_timestamp > 0; if (process_next_initial_record(input, create_invalid_record(), found_filetype, @@ -2078,30 +2101,60 @@ scheduler_tmpl_t::advance_region_of_interest( return sched_type_t::STATUS_OK; VPRINT(this, 2, - "skipping from %" PRIu64 " to %" PRIu64 " instrs (%" PRIu64 + "skipping from %" PRId64 " to %" PRIu64 " instrs (%" PRIu64 " in reader) for ROI\n", cur_instr, cur_range.start_instruction, cur_range.start_instruction - cur_reader_instr - 1); if (options_.schedule_record_ostream != nullptr) { - sched_type_t::stream_status_t status = close_schedule_segment(output, input); - if (status != sched_type_t::STATUS_OK) - return status; - status = record_schedule_segment(output, schedule_record_t::SKIP, input.index, - cur_instr, cur_range.start_instruction); - if (status != sched_type_t::STATUS_OK) - return status; - status = record_schedule_segment(output, schedule_record_t::DEFAULT, input.index, - cur_range.start_instruction); - if (status != sched_type_t::STATUS_OK) - return status; + if (output >= 0) { + record_schedule_skip(output, input.index, cur_instr, + cur_range.start_instruction); + } // Else, will be done in set_cur_input once assigned to an output. } if (cur_range.start_instruction < cur_reader_instr) { // We do not support skipping without skipping over the pre-read: we would // need to extract from the queue. return sched_type_t::STATUS_INVALID; } - return skip_instructions(output, input, - cur_range.start_instruction - cur_reader_instr - 1); + return skip_instructions(input, cur_range.start_instruction - cur_reader_instr - 1); +} + +template +typename scheduler_tmpl_t::stream_status_t +scheduler_tmpl_t::record_schedule_skip(output_ordinal_t output, + input_ordinal_t input, + uint64_t start_instruction, + uint64_t stop_instruction) +{ + if (options_.schedule_record_ostream == nullptr) + return sched_type_t::STATUS_INVALID; + sched_type_t::stream_status_t status; + // Close any prior default record for this input. If we switched inputs, + // we'll already have closed the prior in set_cur_input(). + if (outputs_[output].record.back().type == schedule_record_t::DEFAULT && + outputs_[output].record.back().key.input == input) { + status = close_schedule_segment(output, inputs_[input]); + if (status != sched_type_t::STATUS_OK) + return status; + } + if (outputs_[output].record.size() == 1) { + // Replay doesn't handle starting out with a skip record: we need a + // start=0,stop=0 dummy entry to get things rolling at the start of + // an output's records, if we're the first record after the version. + assert(outputs_[output].record.back().type == schedule_record_t::VERSION); + status = record_schedule_segment(output, schedule_record_t::DEFAULT, input, 0, 0); + if (status != sched_type_t::STATUS_OK) + return status; + } + status = record_schedule_segment(output, schedule_record_t::SKIP, input, + start_instruction, stop_instruction); + if (status != sched_type_t::STATUS_OK) + return status; + status = record_schedule_segment(output, schedule_record_t::DEFAULT, input, + stop_instruction); + if (status != sched_type_t::STATUS_OK) + return status; + return sched_type_t::STATUS_OK; } template @@ -2124,8 +2177,7 @@ scheduler_tmpl_t::clear_input_queue(input_info_t &input) template typename scheduler_tmpl_t::stream_status_t -scheduler_tmpl_t::skip_instructions(output_ordinal_t output, - input_info_t &input, +scheduler_tmpl_t::skip_instructions(input_info_t &input, uint64_t skip_amount) { // reader_t::at_eof_ is true until init() is called. @@ -2161,28 +2213,11 @@ scheduler_tmpl_t::skip_instructions(output_ordinal_t out } } input.in_cur_region = true; - auto *stream = outputs_[output].stream; // We've documented that an output stream's ordinals ignore skips in its input // streams, so we do not need to remember the input's ordinals pre-skip and increase // our output's ordinals commensurately post-skip. - // If we skipped from the start we may not have seen the initial headers: - // use the input's cached copies. - // We set the version and filetype up front for outputs with - // an initial input, so we check a different field to detect a - // skip. - if (stream->cache_line_size_ == 0 || - // Check the version too as a fallback for inputs with no cache size. - stream->version_ == 0) { - stream->version_ = input.reader->get_version(); - stream->last_timestamp_ = input.reader->get_last_timestamp(); - stream->first_timestamp_ = input.reader->get_first_timestamp(); - stream->filetype_ = input.reader->get_filetype(); - stream->cache_line_size_ = input.reader->get_cache_line_size(); - stream->chunk_instr_count_ = input.reader->get_chunk_instr_count(); - stream->page_size_ = input.reader->get_page_size(); - } // We let the user know we've skipped. There's no discontinuity for the // first one so we do not insert a marker there (if we do want to insert one, // we need to update the view tool to handle a window marker as the very @@ -2229,6 +2264,10 @@ scheduler_tmpl_t::record_schedule_segment( // idle records quickly balloon the file. return sched_type_t::STATUS_OK; } + VPRINT(this, 4, + "recording out=%d type=%d input=%d start=%" PRIu64 " stop=%" PRIu64 + " time=%" PRIu64 "\n", + output, type, input, start_instruction, stop_instruction, timestamp); outputs_[output].record.emplace_back(type, input, start_instruction, stop_instruction, timestamp); // The stop is typically updated later in close_schedule_segment(). @@ -2275,9 +2314,11 @@ scheduler_tmpl_t::close_schedule_segment(output_ordinal_ input.index); ++instr_ord; } - VPRINT( - this, 3, "close_schedule_segment: input=%d start=%" PRId64 " stop=%" PRId64 "\n", - input.index, outputs_[output].record.back().value.start_instruction, instr_ord); + VPRINT(this, 3, + "close_schedule_segment: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64 + "\n", + input.index, outputs_[output].record.back().type, + outputs_[output].record.back().value.start_instruction, instr_ord); // Check for empty default entries, except the starter 0,0 ones. assert(outputs_[output].record.back().type != schedule_record_t::DEFAULT || outputs_[output].record.back().value.start_instruction < instr_ord || @@ -2500,11 +2541,17 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, std::lock_guard lock(*inputs_[input].lock); - if (prev_input < 0 && outputs_[output].stream->filetype_ == 0) { + if (prev_input < 0 && outputs_[output].stream->version_ == 0) { // Set the version and filetype up front, to let the user query at init time - // as documented. - outputs_[output].stream->version_ = inputs_[input].reader->get_version(); - outputs_[output].stream->filetype_ = inputs_[input].reader->get_filetype(); + // as documented. Also set the other fields in case we did a skip for ROI. + auto *stream = outputs_[output].stream; + stream->version_ = inputs_[input].reader->get_version(); + stream->last_timestamp_ = inputs_[input].reader->get_last_timestamp(); + stream->first_timestamp_ = inputs_[input].reader->get_first_timestamp(); + stream->filetype_ = inputs_[input].reader->get_filetype(); + stream->cache_line_size_ = inputs_[input].reader->get_cache_line_size(); + stream->chunk_instr_count_ = inputs_[input].reader->get_chunk_instr_count(); + stream->page_size_ = inputs_[input].reader->get_page_size(); } if (inputs_[input].pid != INVALID_PID) { @@ -2541,14 +2588,26 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, } inputs_[input].prev_time_in_quantum = outputs_[output].cur_time; + if (options_.schedule_record_ostream != nullptr) { uint64_t instr_ord = get_instr_ordinal(inputs_[input]); VPRINT(this, 3, "set_cur_input: recording input=%d start=%" PRId64 "\n", input, instr_ord); - sched_type_t::stream_status_t status = - record_schedule_segment(output, schedule_record_t::DEFAULT, input, instr_ord); - if (status != sched_type_t::STATUS_OK) - return status; + if (!inputs_[input].regions_of_interest.empty() && + inputs_[input].cur_region == 0 && inputs_[input].in_cur_region && + (instr_ord == inputs_[input].regions_of_interest[0].start_instruction || + // The ord may be 1 less because we're still on the inserted timestamp. + instr_ord + 1 == inputs_[input].regions_of_interest[0].start_instruction)) { + // We skipped during init but didn't have an output for recording the skip: + // record it now. + record_schedule_skip(output, input, 0, + inputs_[input].regions_of_interest[0].start_instruction); + } else { + sched_type_t::stream_status_t status = record_schedule_segment( + output, schedule_record_t::DEFAULT, input, instr_ord); + if (status != sched_type_t::STATUS_OK) + return status; + } } return STATUS_OK; } @@ -2665,7 +2724,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( "next_record[%d]: skipping from %" PRId64 " to %" PRId64 " in %d for schedule\n", output, cur_reader_instr, segment.stop_instruction, index); - auto status = skip_instructions(output, inputs_[index], + auto status = skip_instructions(inputs_[index], segment.stop_instruction - cur_reader_instr - 1 /*exclusive*/); // Increment the region to get window id markers with ordinals. diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 5611b664180..a716fa84449 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -104,6 +104,7 @@ template class scheduler_tmpl_t { STATUS_ERROR_FILE_READ_FAILED, /**< Error: file read failed. */ STATUS_ERROR_NOT_IMPLEMENTED, /**< Error: not implemented. */ STATUS_ERROR_FILE_WRITE_FAILED, /**< Error: file write failed. */ + STATUS_ERROR_RANGE_INVALID, /**< Error: region of interest invalid. */ }; /** @@ -204,6 +205,12 @@ template class scheduler_tmpl_t { { } /** Convenience constructor for common usage. */ + input_thread_info_t(memref_tid_t tid, std::vector regions) + : tids(1, tid) + , regions_of_interest(regions) + { + } + /** Convenience constructor for common usage. */ input_thread_info_t(memref_tid_t tid, int priority) : tids(1, tid) , priority(priority) @@ -252,6 +259,13 @@ template class scheduler_tmpl_t { * separation, but it is not inserted prior to the first range. A * #dynamorio::drmemtrace::TRACE_TYPE_THREAD_EXIT record is inserted after the * final range. These ranges must be non-overlapping and in increasing order. + * + * Be aware that selecting a subset of code can remove inter-input + * communication steps that could be required for forward progress. + * For example, if selected subsets include #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE + * with no timeout but do not include a corresponding + * #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE for wakeup, an input could remain + * unscheduled. */ std::vector regions_of_interest; }; @@ -1518,7 +1532,13 @@ template class scheduler_tmpl_t { // Does a direct skip, unconditionally. // The caller must hold the input.lock. stream_status_t - skip_instructions(output_ordinal_t output, input_info_t &input, uint64_t skip_amount); + skip_instructions(input_info_t &input, uint64_t skip_amount); + + // Records an input skip in the output's recorded schedule. + // The caller must hold the input.lock. + stream_status_t + record_schedule_skip(output_ordinal_t output, input_ordinal_t input, + uint64_t start_instruction, uint64_t stop_instruction); scheduler_status_t read_and_instantiate_traced_schedule(); diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index b49103559e5..f1ce705749d 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -406,6 +406,7 @@ test_regions_bare_no_marker() /* clang-format off */ make_thread(1), make_pid(1), + make_marker(TRACE_MARKER_TYPE_PAGE_SIZE, 4096), // This would not happen in a real trace, only in tests. But it does // match a dynamic skip from the middle when an instruction has already // been read but not yet passed to the output stream. @@ -663,14 +664,9 @@ test_regions_too_far() std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); sched_inputs[0].thread_modifiers.push_back(scheduler_t::input_thread_info_t(regions)); - if (scheduler.init(sched_inputs, 1, - scheduler_t::make_scheduler_serial_options(/*verbosity=*/4)) != - scheduler_t::STATUS_SUCCESS) - assert(false); - auto *stream = scheduler.get_stream(0); - memref_t memref; - scheduler_t::stream_status_t status = stream->next_record(memref); - assert(status == scheduler_t::STATUS_REGION_INVALID); + auto status = scheduler.init( + sched_inputs, 1, scheduler_t::make_scheduler_serial_options(/*verbosity=*/4)); + assert(status == scheduler_t::STATUS_ERROR_RANGE_INVALID); } static void @@ -4607,6 +4603,129 @@ test_unscheduled_initially() } } +static void +test_unscheduled_initially_roi() +{ +#ifdef HAS_ZIP + std::cerr + << "\n----------------\nTesting initially-unscheduled + time deps with ROI\n"; + static constexpr int NUM_OUTPUTS = 1; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr memref_tid_t TID_A = TID_BASE + 0; + static constexpr memref_tid_t TID_B = TID_BASE + 1; + std::vector refs_A = { + make_thread(TID_A), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + make_timestamp(1001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + // A starts out unscheduled but we skip that. + // (In a real trace some other thread would have to wake up A: + // we omit that here to keep the test small.) + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + make_timestamp(4202), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/101), + // We don't actually start until here. + make_instr(/*pc=*/102), + make_instr(/*pc=*/103), + make_exit(TID_A), + }; + std::vector refs_B = { + make_thread(TID_B), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + make_timestamp(3001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/201), + make_timestamp(4001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/202), + // B starts here, with a lower last timestamp than A. + make_instr(/*pc=*/203), + make_instr(/*pc=*/204), + make_exit(TID_B), + }; + // Instr counts are 1-based. + std::vector regions_A; + regions_A.emplace_back(2, 0); + std::vector regions_B; + regions_B.emplace_back(3, 0); + // B should run first due to the lower timestamp at its ROI despite A's + // start-of-trace timestamp being lower. + static const char *const CORE0_SCHED_STRING = "..BB...AA."; + + std::string record_fname = "tmp_test_unsched_ROI.zip"; + { + // Record. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + sched_inputs.back().thread_modifiers.push_back( + scheduler_t::input_thread_info_t(TID_A, regions_A)); + sched_inputs.back().thread_modifiers.push_back( + scheduler_t::input_thread_info_t(TID_B, regions_B)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + zipfile_ostream_t outfile(record_fname); + sched_ops.schedule_record_ostream = &outfile; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + if (scheduler.write_recorded_schedule() != scheduler_t::STATUS_SUCCESS) + assert(false); + } + { + replay_file_checker_t checker; + zipfile_istream_t infile(record_fname); + std::string res = checker.check(&infile); + if (!res.empty()) + std::cerr << "replay file checker failed: " << res; + assert(res.empty()); + } + { + // Test replay as it has complexities with skip records. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + // The regions are ignored on replay so we do not specify them. + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_AS_PREVIOUSLY, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + zipfile_istream_t infile(record_fname); + sched_ops.schedule_replay_istream = &infile; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + } +#endif +} + static void test_kernel_switch_sequences() { @@ -5119,6 +5238,7 @@ test_main(int argc, const char *argv[]) test_unscheduled(); test_unscheduled_fallback(); test_unscheduled_initially(); + test_unscheduled_initially_roi(); test_kernel_switch_sequences(); test_random_schedule(); test_record_scheduler();