i#6822 unscheduled: Add start-unscheduled support (DynamoRIO#6851)

Adds support for threads starting out in an "unscheduled" state. This is accomplished by always reading ahead in each input and looking for a TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE marker *before* the first instruction. Normally such a marker indicates the invocation of a system call and is after the system call instruction; for start-unscheduled threads it is present at the system call exit at the start of the trace. Changes the scheduler's virtual method process_next_initial_record() to make the booleans on finding certain markers input-and-output parameters and moves filetype marker handling and timestamp recording into the function. This also fixes a problem where an input's initial next_timestamp was replaced with the 2nd timestamp if a subclass read ahead. The extra readahead causes complexities elsewhere which are addressed: + The reader caches the last cpuid to use for synthetic recores on skipping. + Generalizes the existing scheduler handling of readahead (the "recorded_in_schedule" field in input_info_t) to store a count of pre-read instructions, which will generally be either 0 or 1. Adds a new internal interface get_instr_ordinal() to get the input reader's instruction ordinal minus the pre-read count. Changes raw2trace's virtual function process_marker_additionally() to process_marker() and moves all marker processing (including timestamps, which are not markers in the raw format) there, to better support subclasses inserting start-unscheduled markers and deciding whether to insert new markers either before or after pre-existing markers. Adds a scheduler test for the new feature. Issue: DynamoRIO#6822
chenhy0106 · Jun 22, 2024 · 304faf9 · 304faf9
1 parent 44d677f
commit 304faf9
Show file tree

Hide file tree

Showing 7 changed files with 288 additions and 92 deletions.
diff --git a/clients/drcachesim/reader/reader.cpp b/clients/drcachesim/reader/reader.cpp
@@ -347,7 +347,9 @@ reader_t::process_input_entry()
                 if (first_timestamp_ == 0)
                     first_timestamp_ = last_timestamp_;
             }
-        } else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_VERSION)
+        } else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID)
+            last_cpuid_ = cur_ref_.marker.marker_value;
+        else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_VERSION)
             version_ = cur_ref_.marker.marker_value;
         else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_FILETYPE) {
             filetype_ = cur_ref_.marker.marker_value;
@@ -464,6 +466,11 @@ reader_t::skip_instructions_with_timestamp(uint64_t stop_instruction_count)
         timestamp.addr = static_cast<addr_t>(last_timestamp_);
     }
     trace_entry_t cpu = {};
+    if (last_cpuid_ != 0) {
+        cpu.type = TRACE_TYPE_MARKER;
+        cpu.size = TRACE_MARKER_TYPE_CPU_ID;
+        cpu.addr = static_cast<addr_t>(last_cpuid_);
+    }
     trace_entry_t next_instr = {};
     bool prev_was_record_ord = false;
     bool found_real_timestamp = false;

diff --git a/clients/drcachesim/reader/reader.h b/clients/drcachesim/reader/reader.h
@@ -254,6 +254,7 @@ class reader_t : public std::iterator<std::input_iterator_tag, memref_t>,
     uint64_t last_timestamp_ = 0;
     uint64_t first_timestamp_ = 0;
     trace_entry_t *input_entry_ = nullptr;
+    uint64_t last_cpuid_ = 0;
     // Remember top-level headers for the memtrace_stream_t interface.
     uint64_t version_ = 0;
     uint64_t filetype_ = 0;

diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
@@ -1251,7 +1251,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         bool at_eof = false;
         uintptr_t next_timestamp = 0;
         uint64_t instrs_in_quantum = 0;
-        bool recorded_in_schedule = false;
+        int instrs_pre_read = 0;
         // This is a per-workload value, stored in each input for convenience.
         uint64_t base_timestamp = 0;
         // This equals 'options_.deps == DEPENDENCY_TIMESTAMPS', stored here for
@@ -1464,7 +1464,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     // the two bool parameters are what the return value should be based on.
     virtual bool
     process_next_initial_record(input_info_t &input, RecordType record,
-                                bool found_filetype, bool found_timestamp);
+                                bool &found_filetype, bool &found_timestamp);
 
     // Opens up all the readers for each file in 'path' which may be a directory.
     // Returns a map of the thread id of each file to its index in inputs_.
@@ -1612,6 +1612,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     bool
     record_type_is_marker(RecordType record, trace_marker_type_t &type, uintptr_t &value);
 
+    // Returns false for memref_t; for trace_entry_t returns true for the _HEADER,
+    // _THREAD, and _PID record types.
+    bool
+    record_type_is_non_marker_header(RecordType record);
+
     // If the given record is a timestamp, returns true and its fields.
     bool
     record_type_is_timestamp(RecordType record, uintptr_t &value);
@@ -1691,6 +1696,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     uint64_t
     get_input_record_ordinal(output_ordinal_t output);
 
+    // Returns the input instruction ordinal taking into account queued records.
+    // The caller must hold the input's lock.
+    uint64_t
+    get_instr_ordinal(input_info_t &input);
+
     // Returns the first timestamp for the current input stream interface for the
     // 'output_ordinal'-th output stream.
     uint64_t

diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -3512,10 +3512,11 @@ test_replay_as_traced_from_file(const char *testdir)
     static const char *const SCHED_STRING =
         "Core #0: 1257602 \nCore #1: 1257600 \n"
         "Core #2: 1257599 => 1257604 @ <366987,87875,13331862029895453> "
-        // The ordinal is really 1 ("<1,0,0>") but with the scheduler's readahead
-        // it becomes 2; easier to just check for that as trying to avoid readahead
-        // causes other problems (i#xxxx).
-        "(<366986,87875,13331862029895453> => <2,0,0>) \n"
+        // The ordinal is really 1 ("<1,0,0>") but with the scheduler's readahead we have
+        // more records and a timestamp; easier to just check for that as trying to avoid
+        // readahead causes other problems.  Maybe the scheduler should have mediator
+        // getters for the record and timestamp that hide the readahead.
+        "(<366986,87875,13331862029895453> => <6,0,13331862029900572>) \n"
         "Core #3: 1257596 \nCore #4: 1257603 \nCore #5: 1257601 \nCore #6: 1257598 \n";
     static constexpr int NUM_OUTPUTS = 7; // Matches the actual trace's core footprint.
     scheduler_t scheduler;
@@ -4423,6 +4424,123 @@ test_unscheduled_fallback()
     }
 }
 
+static void
+test_unscheduled_initially()
+{
+    std::cerr << "\n----------------\nTesting initially-unscheduled threads\n";
+    static constexpr int NUM_OUTPUTS = 1;
+    static constexpr int BLOCK_LATENCY = 100;
+    static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY);
+    static constexpr uint64_t BLOCK_TIME_MAX = 500;
+    static constexpr memref_tid_t TID_BASE = 100;
+    static constexpr memref_tid_t TID_A = TID_BASE + 0;
+    static constexpr memref_tid_t TID_B = TID_BASE + 1;
+    std::vector<trace_entry_t> refs_A = {
+        make_thread(TID_A),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // A has the earliest timestamp and would start.
+        make_timestamp(1001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        // A starts out unscheduled though.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        make_timestamp(4202),
+        // B makes us scheduled again.
+        make_instr(/*pc=*/102),
+        make_exit(TID_A),
+    };
+    std::vector<trace_entry_t> refs_B = {
+        make_thread(TID_B),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // B runs 2nd by timestamp.
+        make_timestamp(3001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/200),
+        make_timestamp(3002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        // B makes a long-latency blocking syscall, testing whether
+        // A is really unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_timestamp(7002),
+        make_instr(/*pc=*/201),
+        // B makes A no longer unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_A),
+        make_timestamp(7021),
+        make_instr(/*pc=*/202),
+        make_exit(TID_B),
+    };
+    {
+        // Test the defaults with direct switches enabled.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        // We have an idle period while B is blocked and A unscheduled.
+        static const char *const CORE0_SCHED_STRING =
+            "...B.....________________________________________B....B......A.";
+
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        sched_ops.block_time_max = BLOCK_TIME_MAX;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+    {
+        // Test disabling direct switches.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        // A runs first as it being unscheduled is ignored.
+        static const char *const CORE0_SCHED_STRING =
+            ".....A....B.....________________________________________B....B.";
+
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        // We use our mock's time==instruction count for a deterministic result.
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        sched_ops.block_time_max = BLOCK_TIME_MAX;
+        sched_ops.honor_direct_switches = false;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+}
+
 static void
 test_kernel_switch_sequences()
 {
@@ -4935,6 +5053,7 @@ test_main(int argc, const char *argv[])
     test_direct_switch();
     test_unscheduled();
     test_unscheduled_fallback();
+    test_unscheduled_initially();
     test_kernel_switch_sequences();
     test_random_schedule();
     test_record_scheduler();

diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp
@@ -748,9 +748,7 @@ raw2trace_t::process_offline_entry(raw2trace_thread_data_t *tdata,
                 return false;
             trace_marker_type_t marker_type =
                 static_cast<trace_marker_type_t>(in_entry->extended.valueB);
-            buf += trace_metadata_writer_t::write_marker(buf, marker_type, marker_val);
-            if (!process_marker_additionally(tdata, marker_type, marker_val, buf,
-                                             flush_decode_cache))
+            if (!process_marker(tdata, marker_type, marker_val, buf, flush_decode_cache))
                 return false;
             // If there is currently a delayed branch that has not been emitted yet,
             // delay most markers since intra-block markers can cause issues with
@@ -836,11 +834,21 @@ raw2trace_t::process_offline_entry(raw2trace_thread_data_t *tdata,
 }
 
 bool
-raw2trace_t::process_marker_additionally(raw2trace_thread_data_t *tdata,
-                                         trace_marker_type_t marker_type,
-                                         uintptr_t marker_val, byte *&buf,
-                                         DR_PARAM_OUT bool *flush_decode_cache)
-{
+raw2trace_t::process_marker(raw2trace_thread_data_t *tdata,
+                            trace_marker_type_t marker_type, uintptr_t marker_val,
+                            byte *&buf, DR_PARAM_OUT bool *flush_decode_cache)
+{
+    if (marker_type == TRACE_MARKER_TYPE_TIMESTAMP) {
+        uint64 stamp = static_cast<uint64>(marker_val);
+        VPRINT(2, "Thread %u timestamp 0x" ZHEX64_FORMAT_STRING "\n",
+               static_cast<uint>(tdata->tid), stamp);
+        accumulate_to_statistic(tdata, RAW2TRACE_STAT_EARLIEST_TRACE_TIMESTAMP, stamp);
+        accumulate_to_statistic(tdata, RAW2TRACE_STAT_LATEST_TRACE_TIMESTAMP, stamp);
+        tdata->last_timestamp_ = stamp;
+        buf += trace_metadata_writer_t::write_timestamp(buf, marker_val);
+        return true;
+    }
+    buf += trace_metadata_writer_t::write_marker(buf, marker_type, marker_val);
     if (marker_type == TRACE_MARKER_TYPE_KERNEL_EVENT) {
         log(4, "Signal/exception between bbs\n");
         // An rseq side exit may next hit a signal which is then the
@@ -1293,16 +1301,17 @@ raw2trace_t::process_next_thread_buffer(raw2trace_thread_data_t *tdata,
         // when it calls get_next_entry() on its own.
         offline_entry_t entry = *in_entry;
         if (entry.timestamp.type == OFFLINE_TYPE_TIMESTAMP) {
-            VPRINT(2, "Thread %u timestamp 0x" ZHEX64_FORMAT_STRING "\n",
-                   (uint)tdata->tid, (uint64)entry.timestamp.usec);
-            accumulate_to_statistic(tdata, RAW2TRACE_STAT_EARLIEST_TRACE_TIMESTAMP,
-                                    static_cast<uint64>(entry.timestamp.usec));
-            accumulate_to_statistic(tdata, RAW2TRACE_STAT_LATEST_TRACE_TIMESTAMP,
-                                    static_cast<uint64>(entry.timestamp.usec));
-            byte *buf = buf_base +
-                trace_metadata_writer_t::write_timestamp(buf_base,
-                                                         (uintptr_t)entry.timestamp.usec);
-            tdata->last_timestamp_ = entry.timestamp.usec;
+            // Give subclasses a chance for further action on a timestamp by
+            // putting our processing as thought it were a marker at the raw level.
+            bool flush_decode_cache = false;
+            byte *buf = buf_base;
+            uintptr_t value = static_cast<uintptr_t>(entry.timestamp.usec);
+            if (!process_marker(tdata, TRACE_MARKER_TYPE_TIMESTAMP, value, buf,
+                                &flush_decode_cache)) {
+                return false;
+            }
+            if (flush_decode_cache)
+                decode_cache_[tdata->worker].clear();
             if ((uint)(buf - buf_base) >= WRITE_BUFFER_SIZE) {
                 tdata->error = "Too many entries";
                 return false;

diff --git a/clients/drcachesim/tracer/raw2trace.h b/clients/drcachesim/tracer/raw2trace.h
@@ -1101,14 +1101,14 @@ class raw2trace_t {
                           DR_PARAM_OUT bool *flush_decode_cache);
 
     /**
-     * Performs any additional actions for the marker "marker_type" with value
-     * "marker_val", beyond writing out a marker record.  New records can be written to
-     * "buf".  Returns whether successful.
+     * Performs processing actions for the marker "marker_type" with value
+     * "marker_val", including writing out a marker record.  Further records can also
+     * be written to "buf".  Returns whether successful.
      */
     virtual bool
-    process_marker_additionally(raw2trace_thread_data_t *tdata,
-                                trace_marker_type_t marker_type, uintptr_t marker_val,
-                                byte *&buf, DR_PARAM_OUT bool *flush_decode_cache);
+    process_marker(raw2trace_thread_data_t *tdata, trace_marker_type_t marker_type,
+                   uintptr_t marker_val, byte *&buf,
+                   DR_PARAM_OUT bool *flush_decode_cache);
     /**
      * Read the header of a thread, by calling get_next_entry() successively to
      * populate the header values. The timestamp field is populated only