From d900f455493d1732cdb3f5e0de6744dd28b819a0 Mon Sep 17 00:00:00 2001
From: Derek Bruening <bruening@google.com>
Date: Sun, 1 Sep 2024 22:40:06 -0400
Subject: [PATCH] i#6938 sched migrate: Add scheduler lock ownership asserts

Adds a std::mutex wrapper which stores the owner so we can assert on
lock ownership.

Adds lock ownership asserts to scheduler functions whose comments say
they require a lock on entry.

Fixes several cases of missing locks found by these asserts.

Issue: #6938
---
 clients/drcachesim/common/mutex_dbg_owned.h | 95 +++++++++++++++++++++
 clients/drcachesim/scheduler/scheduler.cpp  | 80 +++++++++++------
 clients/drcachesim/scheduler/scheduler.h    | 19 +++--
 3 files changed, 159 insertions(+), 35 deletions(-)
 create mode 100644 clients/drcachesim/common/mutex_dbg_owned.h
diff --git a/clients/drcachesim/common/mutex_dbg_owned.h b/clients/drcachesim/common/mutex_dbg_owned.h
new file mode 100644
index 00000000000..ae25d787558
--- /dev/null
+++ b/clients/drcachesim/common/mutex_dbg_owned.h
@@ -0,0 +1,95 @@
+/* **********************************************************
+ * Copyright (c) 2024 Google, Inc.  All rights reserved.
+ * **********************************************************/
+
+/*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Google, Inc. nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/* mutex_dbg_owned.h: std::mutex plus an assertable owner in debug builds. */
+
+#ifndef _MUTEX_DBG_OWNED_H_
+#define _MUTEX_DBG_OWNED_H_ 1
+
+#include <mutex>
+#include <thread>
+
+namespace dynamorio {
+namespace drmemtrace {
+
+// A wrapper around std::mutex which adds an owner field for asserts on ownership
+// when a lock is required to be held by the caller (where
+// std::unique_lock::owns_lock() cannot easily be used).  The owner field is only
+// maintained when NDEBUG is not defined: i.e., it is targeted for asserts.
+class mutex_dbg_owned {
+public:
+    void
+    lock()
+    {
+        lock_.lock();
+#ifndef NDEBUG
+        owner_ = std::this_thread::get_id();
+#endif
+    }
+    bool
+    try_lock()
+    {
+#ifdef NDEBUG
+        return lock_.try_lock().
+#else
+        if (lock_.try_lock()) {
+            owner_ = std::this_thread::get_id();
+            return true;
+        }
+        return false;
+#endif
+    }
+    void
+    unlock()
+    {
+#ifndef NDEBUG
+        owner_ = std::thread::id();
+#endif
+        lock_.unlock();
+    }
+    // This query should only be called when the lock is required to be held
+    // as it is racy when the lock is not held.
+    bool
+    owned_by_cur_thread()
+    {
+        return owner_ == std::this_thread::get_id();
+    }
+
+private:
+    std::mutex lock_;
+    std::thread::id owner_;
+};
+
+} // namespace drmemtrace
+} // namespace dynamorio
+
+#endif /* _MUTEX_DBG_OWNED_H_ */
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index cce7f596396..793f4a149ec 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -53,6 +53,7 @@
 
 #include "memref.h"
 #include "memtrace_stream.h"
+#include "mutex_dbg_owned.h"
 #include "reader.h"
 #include "record_file_reader.h"
 #include "trace_entry.h"
@@ -579,7 +580,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::stream_t::next_record(RecordType &reco
         return res;
 
     // Update our memtrace_stream_t state.
-    std::lock_guard<std::mutex> guard(*input->lock);
+    std::lock_guard<mutex_dbg_owned> guard(*input->lock);
     if (!input->reader->is_record_synthetic())
         ++cur_ref_count_;
     if (scheduler_->record_type_is_instr_boundary(record, prev_record_))
@@ -630,7 +631,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::stream_t::unread_last_record()
     if (status != sched_type_t::STATUS_OK)
         return status;
     // Restore state.  We document that get_last_timestamp() is not updated.
-    std::lock_guard<std::mutex> guard(*input->lock);
+    std::lock_guard<mutex_dbg_owned> guard(*input->lock);
     if (!input->reader->is_record_synthetic())
         --cur_ref_count_;
     if (scheduler_->record_type_is_instr(record))
@@ -886,6 +887,8 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::scheduler_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::set_initial_schedule(
     std::unordered_map<int, std::vector<int>> &workload2inputs)
 {
+    bool need_lock;
+    auto scoped_lock = acquire_scoped_sched_lock_if_necessary(need_lock);
     // Determine whether we need to read ahead in the inputs.  There are cases where we
     // do not want to do that as it would block forever if the inputs are not available
     // (e.g., online analysis IPC readers); it also complicates ordinals so we avoid it
@@ -1047,7 +1050,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::write_recorded_schedule()
 {
     if (options_.schedule_record_ostream == nullptr)
         return STATUS_ERROR_INVALID_PARAMETER;
-    std::lock_guard<std::mutex> guard(sched_lock_);
+    std::lock_guard<mutex_dbg_owned> guard(sched_lock_);
     for (int i = 0; i < static_cast<int>(outputs_.size()); ++i) {
         sched_type_t::stream_status_t status =
             record_schedule_segment(i, schedule_record_t::FOOTER, 0, 0, 0);
@@ -1762,6 +1765,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
     // output stream(s).
     for (size_t i = 0; i < inputs_.size(); ++i) {
         input_info_t &input = inputs_[i];
+        std::lock_guard<mutex_dbg_owned> lock(*input.lock);
 
         // If the input jumps to the middle immediately, do that now so we'll have
         // the proper start timestamp.
@@ -2127,6 +2131,7 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
     output_ordinal_t output, RecordType &record, input_info_t &input)
 {
+    assert(input.lock->owned_by_cur_thread());
     uint64_t cur_instr = get_instr_ordinal(input);
     uint64_t cur_reader_instr = input.reader->get_instruction_ordinal();
     assert(input.cur_region >= 0 &&
@@ -2208,6 +2213,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::record_schedule_skip(output_ordinal_t
                                                                uint64_t start_instruction,
                                                                uint64_t stop_instruction)
 {
+    assert(inputs_[input].lock->owned_by_cur_thread());
     if (options_.schedule_record_ostream == nullptr)
         return sched_type_t::STATUS_INVALID;
     sched_type_t::stream_status_t status;
@@ -2262,6 +2268,7 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &input,
                                                             uint64_t skip_amount)
 {
+    assert(input.lock->owned_by_cur_thread());
     // reader_t::at_eof_ is true until init() is called.
     if (input.needs_init) {
         input.reader->init();
@@ -2337,6 +2344,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::record_schedule_segment(
     output_ordinal_t output, typename schedule_record_t::record_type_t type,
     input_ordinal_t input, uint64_t start_instruction, uint64_t stop_instruction)
 {
+    assert(type == schedule_record_t::VERSION || type == schedule_record_t::FOOTER ||
+           type == schedule_record_t::IDLE || inputs_[input].lock->owned_by_cur_thread());
     // We always use the current wall-clock time, as the time stored in the prior
     // next_record() call can be out of order across outputs and lead to deadlocks.
     uint64_t timestamp = get_time_micros();
@@ -2363,6 +2372,10 @@ scheduler_tmpl_t<RecordType, ReaderType>::close_schedule_segment(output_ordinal_
 {
     assert(output >= 0 && output < static_cast<output_ordinal_t>(outputs_.size()));
     assert(!outputs_[output].record.empty());
+    assert(outputs_[output].record.back().type == schedule_record_t::VERSION ||
+           outputs_[output].record.back().type == schedule_record_t::FOOTER ||
+           outputs_[output].record.back().type == schedule_record_t::IDLE ||
+           input.lock->owned_by_cur_thread());
     if (outputs_[output].record.back().type == schedule_record_t::SKIP) {
         // Skips already have a final stop value.
         return sched_type_t::STATUS_OK;
@@ -2413,6 +2426,7 @@ template <typename RecordType, typename ReaderType>
 bool
 scheduler_tmpl_t<RecordType, ReaderType>::ready_queue_empty()
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     return ready_priority_.empty();
 }
 
@@ -2420,6 +2434,7 @@ template <typename RecordType, typename ReaderType>
 void
 scheduler_tmpl_t<RecordType, ReaderType>::add_to_unscheduled_queue(input_info_t *input)
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     assert(input->unscheduled &&
            input->blocked_time == 0); // Else should be in regular queue.
     VPRINT(this, 4, "add_to_unscheduled_queue (pre-size %zu): input %d priority %d\n",
@@ -2432,6 +2447,7 @@ template <typename RecordType, typename ReaderType>
 void
 scheduler_tmpl_t<RecordType, ReaderType>::add_to_ready_queue(input_info_t *input)
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     if (input->unscheduled && input->blocked_time == 0) {
         add_to_unscheduled_queue(input);
         return;
@@ -2454,6 +2470,7 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
     output_ordinal_t for_output, input_info_t *&new_input)
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     std::set<input_info_t *> skipped;
     std::set<input_info_t *> blocked;
     input_info_t *res = nullptr;
@@ -2550,6 +2567,7 @@ bool
 scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *input,
                                                                 uint64_t &blocked_time)
 {
+    assert(input->lock->owned_by_cur_thread());
     uint64_t post_time = input->reader->get_last_timestamp();
     assert(input->processing_syscall || input->processing_maybe_blocking_syscall);
     if (input->reader->get_version() < TRACE_ENTRY_VERSION_FREQUENT_TIMESTAMPS) {
@@ -2580,6 +2598,7 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
                                                         input_ordinal_t input)
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     // XXX i#5843: Merge tracking of current inputs with ready_priority_ to better manage
     // the possible 3 states of each input (a live cur_input for an output stream, in
     // the ready_queue_, or at EOF) (4 states once we add i/o wait times).
@@ -2594,7 +2613,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
         }
         if (prev_input != input && options_.schedule_record_ostream != nullptr) {
             input_info_t &prev_info = inputs_[prev_input];
-            std::lock_guard<std::mutex> lock(*prev_info.lock);
+            std::lock_guard<mutex_dbg_owned> lock(*prev_info.lock);
             sched_type_t::stream_status_t status =
                 close_schedule_segment(output, prev_info);
             if (status != sched_type_t::STATUS_OK)
@@ -2617,11 +2636,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
 
     int prev_workload = -1;
     if (outputs_[output].prev_input >= 0 && outputs_[output].prev_input != input) {
-        std::lock_guard<std::mutex> lock(*inputs_[outputs_[output].prev_input].lock);
+        std::lock_guard<mutex_dbg_owned> lock(*inputs_[outputs_[output].prev_input].lock);
         prev_workload = inputs_[outputs_[output].prev_input].workload;
     }
 
-    std::lock_guard<std::mutex> lock(*inputs_[input].lock);
+    std::lock_guard<mutex_dbg_owned> lock(*inputs_[input].lock);
 
     if (inputs_[input].prev_output != INVALID_OUTPUT_ORDINAL &&
         inputs_[input].prev_output != output) {
@@ -2707,6 +2726,7 @@ typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
     output_ordinal_t output, input_ordinal_t &index)
 {
+    assert(!need_sched_lock() || sched_lock_.owned_by_cur_thread());
     if (outputs_[output].record_index + 1 >=
         static_cast<int>(outputs_[output].record.size())) {
         if (!outputs_[output].at_eof) {
@@ -2730,7 +2750,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
            output, index, get_instr_ordinal(inputs_[index]), segment.type,
            segment.value.start_instruction, segment.stop_instruction);
     {
-        std::lock_guard<std::mutex> lock(*inputs_[index].lock);
+        std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
         if (get_instr_ordinal(inputs_[index]) > segment.value.start_instruction) {
             VPRINT(this, 1,
                    "WARNING: next_record[%d]: input %d wants instr #%" PRId64
@@ -2795,7 +2815,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
         }
     }
     if (segment.type == schedule_record_t::SYNTHETIC_END) {
-        std::lock_guard<std::mutex> lock(*inputs_[index].lock);
+        std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
         // We're past the final region of interest and we need to insert
         // a synthetic thread exit record.  We need to first throw out the
         // queued candidate record, if any.
@@ -2808,7 +2828,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
         ++outputs_[output].record_index;
         return sched_type_t::STATUS_SKIPPED;
     } else if (segment.type == schedule_record_t::SKIP) {
-        std::lock_guard<std::mutex> lock(*inputs_[index].lock);
+        std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
         uint64_t cur_reader_instr = inputs_[index].reader->get_instruction_ordinal();
         VPRINT(this, 2,
                "next_record[%d]: skipping from %" PRId64 " to %" PRId64
@@ -2840,13 +2860,13 @@ scheduler_tmpl_t<RecordType, ReaderType>::need_sched_lock()
 }
 
 template <typename RecordType, typename ReaderType>
-std::unique_lock<std::mutex>
+std::unique_lock<mutex_dbg_owned>
 scheduler_tmpl_t<RecordType, ReaderType>::acquire_scoped_sched_lock_if_necessary(
     bool &need_lock)
 {
     need_lock = need_sched_lock();
-    auto scoped_lock = need_lock ? std::unique_lock<std::mutex>(sched_lock_)
-                                 : std::unique_lock<std::mutex>();
+    auto scoped_lock = need_lock ? std::unique_lock<mutex_dbg_owned>(sched_lock_)
+                                 : std::unique_lock<mutex_dbg_owned>();
     return scoped_lock;
 }
 
@@ -2894,7 +2914,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                     return res;
             } else if (options_.mapping == MAP_TO_ANY_OUTPUT) {
                 if (blocked_time > 0 && prev_index != INVALID_INPUT_ORDINAL) {
-                    std::lock_guard<std::mutex> lock(*inputs_[prev_index].lock);
+                    std::lock_guard<mutex_dbg_owned> lock(*inputs_[prev_index].lock);
                     if (inputs_[prev_index].blocked_time == 0) {
                         VPRINT(this, 2, "next_record[%d]: blocked time %" PRIu64 "\n",
                                output, blocked_time);
@@ -2906,7 +2926,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                     inputs_[prev_index].switch_to_input != INVALID_INPUT_ORDINAL) {
                     input_info_t *target = &inputs_[inputs_[prev_index].switch_to_input];
                     inputs_[prev_index].switch_to_input = INVALID_INPUT_ORDINAL;
-                    std::lock_guard<std::mutex> lock(*target->lock);
+                    std::lock_guard<mutex_dbg_owned> lock(*target->lock);
                     // XXX i#5843: Add an invariant check that the next timestamp of the
                     // target is later than the pre-switch-syscall timestamp?
                     if (ready_priority_.find(target)) {
@@ -2965,7 +2985,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                 } else if (ready_queue_empty() && blocked_time == 0) {
                     if (prev_index == INVALID_INPUT_ORDINAL)
                         return eof_or_idle(output, need_lock, prev_index);
-                    auto lock = std::unique_lock<std::mutex>(*inputs_[prev_index].lock);
+                    auto lock =
+                        std::unique_lock<mutex_dbg_owned>(*inputs_[prev_index].lock);
                     // If we can't go back to the current input, we're EOF or idle.
                     // TODO i#6959: We should go the EOF/idle route if
                     // inputs_[prev_index].unscheduled as otherwise we're ignoring its
@@ -3015,7 +3036,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
             } else if (options_.deps == DEPENDENCY_TIMESTAMPS) {
                 uint64_t min_time = std::numeric_limits<uint64_t>::max();
                 for (size_t i = 0; i < inputs_.size(); ++i) {
-                    std::lock_guard<std::mutex> lock(*inputs_[i].lock);
+                    std::lock_guard<mutex_dbg_owned> lock(*inputs_[i].lock);
                     if (!inputs_[i].at_eof && inputs_[i].next_timestamp > 0 &&
                         inputs_[i].next_timestamp < min_time) {
                         min_time = inputs_[i].next_timestamp;
@@ -3047,13 +3068,13 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
             } else
                 return sched_type_t::STATUS_INVALID;
             // reader_t::at_eof_ is true until init() is called.
-            std::lock_guard<std::mutex> lock(*inputs_[index].lock);
+            std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
             if (inputs_[index].needs_init) {
                 inputs_[index].reader->init();
                 inputs_[index].needs_init = false;
             }
         }
-        std::lock_guard<std::mutex> lock(*inputs_[index].lock);
+        std::lock_guard<mutex_dbg_owned> lock(*inputs_[index].lock);
         if (inputs_[index].at_eof ||
             *inputs_[index].reader == *inputs_[index].reader_end) {
             VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index);
@@ -3086,6 +3107,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::process_marker(input_info_t &input,
                                                          trace_marker_type_t marker_type,
                                                          uintptr_t marker_value)
 {
+    assert(input.lock->owned_by_cur_thread());
     switch (marker_type) {
     case TRACE_MARKER_TYPE_SYSCALL:
         input.processing_syscall = true;
@@ -3196,7 +3218,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::process_marker(input_info_t &input,
             auto scoped_sched_lock =
                 acquire_scoped_sched_lock_if_necessary(need_sched_lock);
             input_info_t *target = &inputs_[target_idx];
-            std::lock_guard<std::mutex> lock(*target->lock);
+            std::lock_guard<mutex_dbg_owned> lock(*target->lock);
             if (target->unscheduled) {
                 target->unscheduled = false;
                 if (unscheduled_priority_.find(target)) {
@@ -3275,7 +3297,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
         return eof_or_idle(output, /*hold_sched_lock=*/false, outputs_[output].cur_input);
     }
     input = &inputs_[outputs_[output].cur_input];
-    auto lock = std::unique_lock<std::mutex>(*input->lock);
+    auto lock = std::unique_lock<mutex_dbg_owned>(*input->lock);
     // Since we do not ask for a start time, we have to check for the first record from
     // each input and set the time here.
     if (input->prev_time_in_quantum == 0)
@@ -3329,7 +3351,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_SKIPPED)
                     return res;
                 input = &inputs_[outputs_[output].cur_input];
-                lock = std::unique_lock<std::mutex>(*input->lock);
+                lock = std::unique_lock<mutex_dbg_owned>(*input->lock);
                 if (res == sched_type_t::STATUS_SKIPPED) {
                     // Like for the ROI below, we need the queue or a de-ref.
                     input->needs_advance = false;
@@ -3540,7 +3562,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 if (res == sched_type_t::STATUS_WAIT)
                     return res;
                 input = &inputs_[outputs_[output].cur_input];
-                lock = std::unique_lock<std::mutex>(*input->lock);
+                lock = std::unique_lock<mutex_dbg_owned>(*input->lock);
                 continue;
             } else {
                 lock.lock();
@@ -3596,7 +3618,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal_t ou
         return sched_type_t::STATUS_INVALID;
     record = outinfo.last_record;
     input = &inputs_[outinfo.cur_input];
-    std::lock_guard<std::mutex> lock(*input->lock);
+    std::lock_guard<mutex_dbg_owned> lock(*input->lock);
     VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
            input->index);
     input->queue.push_back(outinfo.last_record);
@@ -3664,6 +3686,7 @@ template <typename RecordType, typename ReaderType>
 void
 scheduler_tmpl_t<RecordType, ReaderType>::mark_input_eof(input_info_t &input)
 {
+    assert(input.lock->owned_by_cur_thread());
     if (input.at_eof)
         return;
     input.at_eof = true;
@@ -3692,14 +3715,15 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
         assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof);
         return sched_type_t::STATUS_EOF;
     } else {
+        bool need_lock;
+        auto scoped_lock = hold_sched_lock
+            ? std::unique_lock<mutex_dbg_owned>()
+            : acquire_scoped_sched_lock_if_necessary(need_lock);
         if (options_.mapping == MAP_TO_ANY_OUTPUT) {
             // Workaround to avoid hangs when _SCHEDULE and/or _DIRECT_THREAD_SWITCH
             // directives miss their targets (due to running with a subset of the
             // original threads, or other scenarios) and we end up with no scheduled
             // inputs but a set of unscheduled inputs who will never be scheduled.
-            auto scoped_lock = hold_sched_lock
-                ? std::unique_lock<std::mutex>()
-                : std::unique_lock<std::mutex>(sched_lock_);
             VPRINT(this, 4,
                    "eof_or_idle output=%d live=%d unsched=%zu runq=%zu blocked=%d\n",
                    output, live_input_count_.load(std::memory_order_acquire),
@@ -3723,7 +3747,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
                                "queue\n");
                         while (!unscheduled_priority_.empty()) {
                             input_info_t *tomove = unscheduled_priority_.top();
-                            std::lock_guard<std::mutex> lock(*tomove->lock);
+                            std::lock_guard<mutex_dbg_owned> lock(*tomove->lock);
                             tomove->unscheduled = false;
                             ready_priority_.push(tomove);
                             unscheduled_priority_.pop();
@@ -3775,7 +3799,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_output_active(output_ordinal_t out
     outputs_[output].active = active;
     VPRINT(this, 2, "Output stream %d is now %s\n", output,
            active ? "active" : "inactive");
-    std::lock_guard<std::mutex> guard(sched_lock_);
+    std::lock_guard<mutex_dbg_owned> guard(sched_lock_);
     if (!active) {
         // Make the now-inactive output's input available for other cores.
         // This will reset its quantum too.
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index ab380781675..d1f46012d28 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -64,6 +64,7 @@
 #include "flexible_queue.h"
 #include "memref.h"
 #include "memtrace_stream.h"
+#include "mutex_dbg_owned.h"
 #include "reader.h"
 #include "record_file_reader.h"
 #include "speculator.h"
@@ -1251,7 +1252,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
 
     struct input_info_t {
         input_info_t()
-            : lock(new std::mutex)
+            : lock(new mutex_dbg_owned)
         {
         }
         // Returns whether the stream mixes threads (online analysis mode) yet
@@ -1270,7 +1271,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // We use a unique_ptr to make this moveable for vector storage.
         // For inputs not actively assigned to a core but sitting in the ready_queue,
         // sched_lock_ suffices to synchronize access.
-        std::unique_ptr<std::mutex> lock;
+        std::unique_ptr<mutex_dbg_owned> lock;
         // A tid can be duplicated across workloads so we need the pair of
         // workload index + tid to identify the original input.
         int workload = -1;
@@ -1631,7 +1632,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     uint64_t
     get_output_time(output_ordinal_t output);
 
-    // The caller must hold the lock for the input.
+    // The caller must hold the lock for the input unless it's not a real
+    // input index (it's not real for VERSION, FOOTER, and IDLE).
     stream_status_t
     record_schedule_segment(
         output_ordinal_t output, typename schedule_record_t::record_type_t type,
@@ -1642,7 +1644,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // max macro (even despite NOMINMAX defined above).
         uint64_t stop_instruction = (std::numeric_limits<uint64_t>::max)());
 
-    // The caller must hold the input.lock.
+    // The caller must hold the input.lock unless the record type
+    // is VERSION, FOOTER, or IDLE.
     stream_status_t
     close_schedule_segment(output_ordinal_t output, input_info_t &input);
 
@@ -1773,7 +1776,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     get_input_record_ordinal(output_ordinal_t output);
 
     // Returns the input instruction ordinal taking into account queued records.
-    // The caller must hold the input's lock.
+    // XXX: We need to clearly delineate where the input lock is needed: here
+    // we read the queue which shouldn't be changed by other threads; yet this
+    // routine used to claim it needed the input lock.
     uint64_t
     get_instr_ordinal(input_info_t &input);
 
@@ -1846,7 +1851,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     bool
     need_sched_lock();
 
-    std::unique_lock<std::mutex>
+    std::unique_lock<mutex_dbg_owned>
     acquire_scoped_sched_lock_if_necessary(bool &need_lock);
 
     // sched_lock_ must be held by the caller.
@@ -1896,7 +1901,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     // ready_counter_, unscheduled_priority_, and unscheduled_counter_.
     // This cannot be acquired while holding an input lock: it must
     // be acquired first, to avoid deadlocks.
-    std::mutex sched_lock_;
+    mutex_dbg_owned sched_lock_;
     // Inputs ready to be scheduled, sorted by priority and then timestamp if timestamp
     // dependencies are requested.  We use the timestamp delta from the first observed
     // timestamp in each workload in order to mix inputs from different workloads in the