diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index 5b26180ba5c..6b1a66e2b13 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -567,10 +567,21 @@ typedef enum { * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers) * that causes an immediate switch to another thread on the same core (with the * current thread entering an unscheduled state), bypassing the kernel scheduler's - * normal dynamic switch code based on run queues. The marker value holds the - * thread id of the target thread. This should generally always be after a - * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as such a switch always - * has a chance of blocking if the target needs to be migrated. + * normal dynamic switch code based on run queues. The marker value holds the thread + * id of the target thread. The current thread will remain unschedulable + * indefinitely unless another thread resumes it with either + * #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE; + * or, if a #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT marker is present, the thread will + * become schedulable when that timeout expires. This marker provides a mechanism to + * model these semantics while abstracting away whether the underlying system call is + * a custom kernel extension or a variant of "futex" or other selective wait-notify + * scheme. This marker should generally always be after a + * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as such a switch always has a + * chance of blocking the source thread. See also + * #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, and + * #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE. The scheduler only models this behavior when + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches + * is true. */ TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, @@ -627,6 +638,51 @@ typedef enum { */ TRACE_MARKER_TYPE_VECTOR_LENGTH, + /** + * This marker is emitted prior to a system call (but after the system call's + * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers) + * that causes the current thread to become unschedulable (removed from all queues of + * runnable threads). The thread will remain unschedulable indefinitely unless + * another thread resumes it with either #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or + * #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE; or, if a + * #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT marker is present, the thread will become + * schedulable when that timeout expires. This marker provides a mechanism to model + * these semantics while abstracting away whether the underlying system call is a + * custom kernel extension or a variant of "futex" or other selective wait-notify + * scheme. This marker should generally always be after a + * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as becoming unschedulable is a + * form of blocking and results in a context switch. The scheduler only models this + * behavior when + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches + * is true. 
+     */
+    TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE,
+
+    /**
+     * This marker is emitted prior to a system call (but after the system call's
+     * #TRACE_MARKER_TYPE_SYSCALL marker) that causes a target thread identified in the
+     * marker value to become schedulable again if it were currently unschedulable, or,
+     * if it is not currently unschedulable, to *not* become unschedulable on its next
+     * action that would otherwise do so.  See also #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE
+     * and #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH.  This marker provides a mechanism to
+     * model these semantics while abstracting away whether the underlying system call is
+     * a custom kernel extension or a variant of "futex" or other selective wait-notify
+     * scheme.  The scheduler only models this behavior when
+     * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches
+     * is true.
+     */
+    TRACE_MARKER_TYPE_SYSCALL_SCHEDULE,
+
+    /**
+     * This marker is emitted prior to a system call (but after the system call's
+     * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers)
+     * which also has a #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or
+     * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE marker.  This indicates a timeout provided
+     * by the application after which the thread will become schedulable again.  The
+     * marker value holds the timeout duration in microseconds.
+     */
+    TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT,
+
     // ...
     // These values are reserved for future built-in marker types.
     // ...
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index bbdad729e59..b7bf1170d32 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -931,15 +931,22 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_initial_schedule(
             set_cur_input(i, INVALID_INPUT_ORDINAL);
         }
     } else {
-        // Just take the 1st N inputs (even if all from the same workload).
+        // Just take the 1st N schedulable (i.e., not "unscheduled") inputs,
+        // where N is the number of outputs (even if all from the same workload).
+        input_ordinal_t input = 0;
         for (int i = 0; i < static_cast<int>(outputs_.size()); ++i) {
-            if (i < static_cast<int>(inputs_.size()))
-                set_cur_input(i, i);
+            while (input < static_cast<int>(inputs_.size()) &&
+                   inputs_[input].unscheduled) {
+                add_to_ready_queue(&inputs_[input]);
+                ++input;
+            }
+            if (input < static_cast<int>(inputs_.size()))
+                set_cur_input(i, input);
             else
                 set_cur_input(i, INVALID_INPUT_ORDINAL);
+            ++input;
         }
-        for (int i = static_cast<int>(outputs_.size());
-             i < static_cast<int>(inputs_.size()); ++i) {
+        for (int i = input; i < static_cast<int>(inputs_.size()); ++i) {
             add_to_ready_queue(&inputs_[i]);
         }
     }
@@ -1479,6 +1486,12 @@ bool
 scheduler_tmpl_t<RecordType, ReaderType>::process_next_initial_record(
     input_info_t &input, RecordType record, bool found_filetype, bool found_timestamp)
 {
+    // TODO i#6822: Always look ahead until the first instruction, looking
+    // for threads that start out with an exit from an UNSCHEDULE or DIRECT
+    // syscall so we can have them start out unscheduled: though we can't
+    // easily know whether there was a timeout unless we read way ahead past
+    // signal handlers until the syscall exits to look for -ETIMEDOUT.
+    // Should we have raw2trace look for that?
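For a concrete picture of the ordering the new doc comments prescribe, here is a sketch of the record sequence a tracer would emit for a futex-like timed wait, built with the make_marker() mock helper used in scheduler_unit_tests.cpp later in this patch (the syscall number and timeout value are arbitrary illustrations):

    // Sketch only: marker order for a timed "unschedule" request.
    std::vector<trace_entry_t> timed_wait = {
        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
        // Timeout in microseconds; it must precede the unschedule request
        // so the scheduler has syscall_timeout_arg set when it sees it.
        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, 2000),
        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
    };

Omitting the _ARG_TIMEOUT marker would make the same sequence an indefinite unschedule, resumable only via _SYSCALL_SCHEDULE or _DIRECT_THREAD_SWITCH.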
     return !(found_filetype && found_timestamp);
 }
 
@@ -1512,7 +1525,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             if (record_type_is_marker(record, marker_type, marker_value) &&
                 marker_type == TRACE_MARKER_TYPE_FILETYPE) {
                 found_filetype = true;
-                VPRINT(this, 2, "Input %zu filetype %zu\n", i, marker_value);
+                VPRINT(this, 2, "Input %zu filetype %zu from queue\n", i,
+                       marker_value);
             }
             if (record_type_is_timestamp(record, input.next_timestamp))
                 found_timestamp = true;
@@ -1561,7 +1575,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
                 // synthetic trace in a test) or we may have to read thousands of records
                 // to find it if it were somehow missing, which we do not want to do.  We
                 // assume our queued records are few and do not include instructions when
-                // we skip (see skip_instrutions()).  Thus, we abort with an error.
+                // we skip (see skip_instructions()).  Thus, we abort with an error.
                 if (record_type_is_instr(record))
                     break;
                 input.queue.push_back(record);
@@ -1833,7 +1847,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
                input.cur_region);
     if (input.cur_region >= static_cast<int>(input.regions_of_interest.size())) {
         if (input.at_eof)
-            return eof_or_idle(output);
+            return eof_or_idle(output, /*hold_sched_lock=*/false);
         else {
             // We let the user know we're done.
             if (options_.schedule_record_ostream != nullptr) {
@@ -2070,10 +2084,26 @@ scheduler_tmpl_t<RecordType, ReaderType>::ready_queue_empty()
     return ready_priority_.empty();
 }
 
+template <typename RecordType, typename ReaderType>
+void
+scheduler_tmpl_t<RecordType, ReaderType>::add_to_unscheduled_queue(input_info_t *input)
+{
+    assert(input->unscheduled &&
+           input->blocked_time == 0); // Else should be in regular queue.
+    VPRINT(this, 4, "add_to_unscheduled_queue (pre-size %zu): input %d priority %d\n",
+           unscheduled_priority_.size(), input->index, input->priority);
+    input->queue_counter = ++unscheduled_counter_;
+    unscheduled_priority_.push(input);
+}
+
 template <typename RecordType, typename ReaderType>
 void
 scheduler_tmpl_t<RecordType, ReaderType>::add_to_ready_queue(input_info_t *input)
 {
+    if (input->unscheduled && input->blocked_time == 0) {
+        add_to_unscheduled_queue(input);
+        return;
+    }
     VPRINT(
         this, 4,
         "add_to_ready_queue (pre-size %zu): input %d priority %d timestamp delta %" PRIu64
@@ -2105,6 +2135,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
             res = ready_priority_.top();
             ready_priority_.pop();
         }
+        assert(!res->unscheduled ||
+               res->blocked_time > 0); // Should be in unscheduled_priority_.
         if (res->binding.empty() || res->binding.find(for_output) != res->binding.end()) {
             // For blocked inputs, as we don't have interrupts or other regular
             // control points we only check for being unblocked when an input
@@ -2144,9 +2176,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
         static int heartbeat;
         // We are ok with races as the cadence is approximate.
         if (++heartbeat % 500 == 0) {
-            VPRINT(this, 1, "heartbeat[%d] %zd in queue; %d blocked => %d %d\n",
+            VPRINT(this, 1,
+                   "heartbeat[%d] %zd in queue; %d blocked; %zd unscheduled => %d %d\n",
                    for_output, ready_priority_.size(), num_blocked_,
-                   res == nullptr ? -1 : res->index, status);
+                   unscheduled_priority_.size(), res == nullptr ? -1 : res->index,
+                   status);
         }
     });
     if (res != nullptr) {
@@ -2156,11 +2190,29 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
                for_output, ready_priority_.size(), res->index, res->priority,
                res->reader->get_last_timestamp() - res->base_timestamp);
         res->blocked_time = 0;
+        res->unscheduled = false;
     }
     new_input = res;
     return status;
 }
 
+template <typename RecordType, typename ReaderType>
+uint64_t
+scheduler_tmpl_t<RecordType, ReaderType>::scale_blocked_time(uint64_t initial_time) const
+{
+    uint64_t scaled = static_cast<uint64_t>(static_cast<double>(initial_time) *
                                            options_.block_time_scale);
+    if (scaled > options_.block_time_max) {
+        // We have a max to avoid outlier latencies that are already a second or
+        // more from scaling up to tens of minutes.  We assume a cap is representative
+        // as the outliers likely were not part of key dependence chains.  Without a
+        // cap the other threads all finish and the simulation waits for tens of
+        // minutes further for a couple of outliers.
+        scaled = options_.block_time_max;
+    }
+    return scaled;
+}
+
 template <typename RecordType, typename ReaderType>
 bool
 scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *input,
@@ -2181,16 +2233,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *in
     uint64_t threshold = input->processing_maybe_blocking_syscall
         ? options_.blocking_switch_threshold
         : options_.syscall_switch_threshold;
-    blocked_time =
-        static_cast<uint64_t>(static_cast<double>(latency) * options_.block_time_scale);
-    if (blocked_time > options_.block_time_max) {
-        // We have a max to avoid outlier latencies that are already a second or
-        // more from scaling up to tens of minutes.  We assume a cap is representative
-        // as the outliers likely were not part of key dependence chains.  Without a
-        // cap the other threads all finish and the simulation waits for tens of
-        // minutes further for a couple of outliers.
-        blocked_time = options_.block_time_max;
-    }
+    blocked_time = scale_blocked_time(latency);
     VPRINT(this, 3,
            "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64
           "\n",
@@ -2214,8 +2257,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
     int prev_input = outputs_[output].cur_input;
     if (prev_input >= 0) {
         if (options_.mapping == MAP_TO_ANY_OUTPUT && prev_input != input &&
-            !inputs_[prev_input].at_eof)
+            !inputs_[prev_input].at_eof) {
             add_to_ready_queue(&inputs_[prev_input]);
+        }
         if (prev_input != input && options_.schedule_record_ostream != nullptr) {
             input_info_t &prev_info = inputs_[prev_input];
             std::lock_guard<std::mutex> lock(*prev_info.lock);
@@ -2320,7 +2364,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
             outputs_[output].at_eof = true;
             live_replay_output_count_.fetch_add(-1, std::memory_order_release);
         }
-        return eof_or_idle(output);
+        return eof_or_idle(output, need_sched_lock());
     }
     const schedule_record_t &segment =
         outputs_[output].record[outputs_[output].record_index + 1];
@@ -2441,22 +2485,41 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
     return sched_type_t::STATUS_OK;
 }
 
+template <typename RecordType, typename ReaderType>
+bool
+scheduler_tmpl_t<RecordType, ReaderType>::need_sched_lock()
+{
+    return options_.mapping == MAP_TO_ANY_OUTPUT || options_.mapping == MAP_AS_PREVIOUSLY;
+}
+
+template <typename RecordType, typename ReaderType>
+std::unique_lock<std::mutex>
+scheduler_tmpl_t<RecordType, ReaderType>::acquire_scoped_sched_lock_if_necessary(
+    bool &need_lock)
+{
+    need_lock = need_sched_lock();
+    auto scoped_lock = need_lock ? std::unique_lock<std::mutex>(sched_lock_)
+                                 : std::unique_lock<std::mutex>();
+    return scoped_lock;
+}
+
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t output,
                                                           uint64_t blocked_time)
 {
     sched_type_t::stream_status_t res = sched_type_t::STATUS_OK;
-    bool need_lock =
-        options_.mapping == MAP_TO_ANY_OUTPUT || options_.mapping == MAP_AS_PREVIOUSLY;
-    auto scoped_lock = need_lock ? std::unique_lock<std::mutex>(sched_lock_)
-                                 : std::unique_lock<std::mutex>();
+    bool need_lock;
+    auto scoped_lock = acquire_scoped_sched_lock_if_necessary(need_lock);
     input_ordinal_t prev_index = outputs_[output].cur_input;
     input_ordinal_t index = INVALID_INPUT_ORDINAL;
     int iters = 0;
     while (true) {
         ++iters;
         if (index < 0) {
+            // XXX i#6831: Refactor to use subclasses or templates to specialize
+            // scheduler code based on mapping options, to avoid these top-level
+            // conditionals in many functions?
             if (options_.mapping == MAP_AS_PREVIOUSLY) {
                 res = pick_next_input_as_previously(output, index);
                 VDO(this, 2, {
@@ -2491,20 +2554,21 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                            output, blocked_time);
                     inputs_[prev_index].blocked_time = blocked_time;
                     inputs_[prev_index].blocked_start_time = get_output_time(output);
-                } else {
-                    // If we looped we could have the same prev_index.
-                    assert(iters > 1);
                 }
             }
             if (prev_index != INVALID_INPUT_ORDINAL &&
                 inputs_[prev_index].switch_to_input != INVALID_INPUT_ORDINAL) {
                 input_info_t *target = &inputs_[inputs_[prev_index].switch_to_input];
                 inputs_[prev_index].switch_to_input = INVALID_INPUT_ORDINAL;
+                std::lock_guard<std::mutex> lock(*target->lock);
                 // XXX i#5843: Add an invariant check that the next timestamp of the
                 // target is later than the pre-switch-syscall timestamp?
                 if (ready_priority_.find(target)) {
-                    VPRINT(this, 2, "next_record[%d]: direct switch to input %d\n",
-                           output, target->index);
+                    VPRINT(this, 2,
+                           "next_record[%d]: direct switch from input %d to input %d "
+                           "@%" PRIu64 "\n",
+                           output, prev_index, target->index,
+                           inputs_[prev_index].reader->get_last_timestamp());
                     ready_priority_.erase(target);
                     index = target->index;
                     // Erase any remaining wait time for the target.
@@ -2515,29 +2579,46 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                                output, target->index);
                         --num_blocked_;
                         target->blocked_time = 0;
+                        target->unscheduled = false;
                     }
+                } else if (unscheduled_priority_.find(target)) {
+                    target->unscheduled = false;
+                    unscheduled_priority_.erase(target);
+                    index = target->index;
+                    VPRINT(this, 2,
+                           "next_record[%d]: direct switch from input %d to "
+                           "was-unscheduled input %d "
+                           "@%" PRIu64 "\n",
+                           output, prev_index, target->index,
+                           inputs_[prev_index].reader->get_last_timestamp());
                 } else {
-                    // TODO i#5843: If the target is running on another output, we
-                    // need to do a forced migration by setting a flag to force a
-                    // preempt and presumably waiting (STATUS_WAIT or STATUS_IDLE?)
-                    // here until the input is available.
-                    // For now we print a message so we can notice when this
-                    // happens, but we ignore the direct switch request.
+                    // We assume that inter-input dependencies are captured in
+                    // the _DIRECT_THREAD_SWITCH, _UNSCHEDULE, and _SCHEDULE markers
+                    // and that if a switch request targets a thread running elsewhere
+                    // that means there isn't a dependence and this is really a
+                    // dynamic switch to whoever happens to be available (and
+                    // different timing between tracing and analysis has caused this
+                    // miss).
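A brief aside on the locking helper introduced just above: it relies on std::unique_lock's default constructor producing a lock that owns no mutex, so the same scoped object works whether or not the lock is taken, and an unowned lock's destructor is a no-op. A standalone sketch of the idiom, with hypothetical names rather than the scheduler's own:

    #include <mutex>

    std::mutex g_lock; // Stand-in for sched_lock_.

    void
    do_work(bool need_lock)
    {
        // Owns g_lock only when need_lock is true; otherwise the
        // default-constructed unique_lock holds nothing.
        auto scoped = need_lock ? std::unique_lock<std::mutex>(g_lock)
                                : std::unique_lock<std::mutex>();
        // ... work that may or may not require mutual exclusion ...
    }
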
                    VPRINT(this, 1,
-                           "Direct switch target input #%d is running elsewhere and "
-                           "forced migration is NYI\n",
-                           target->index);
+                           "Direct switch (from %d) target input #%d is running "
+                           "elsewhere; picking a different target @%" PRIu64 "\n",
+                           prev_index, target->index,
+                           inputs_[prev_index].reader->get_last_timestamp());
+                    // We do ensure the missed target doesn't wait indefinitely.
+                    // XXX i#6822: It's not clear this is always the right thing to
+                    // do.
+                    target->skip_next_unscheduled = true;
                 }
             }
             if (index != INVALID_INPUT_ORDINAL) {
                 // We found a direct switch target above.
             } else if (ready_queue_empty() && blocked_time == 0) {
                 if (prev_index == INVALID_INPUT_ORDINAL)
-                    return eof_or_idle(output);
+                    return eof_or_idle(output, need_lock);
                 auto lock = std::unique_lock<std::mutex>(*inputs_[prev_index].lock);
                 if (inputs_[prev_index].at_eof) {
                     lock.unlock();
-                    return eof_or_idle(output);
+                    return eof_or_idle(output, need_lock);
                 } else
                     index = prev_index; // Go back to prior.
             } else {
@@ -2565,7 +2646,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                 }
                 if (queue_next == nullptr) {
                     assert(blocked_time == 0 || prev_index == INVALID_INPUT_ORDINAL);
-                    return eof_or_idle(output);
+                    return eof_or_idle(output, need_lock);
                 }
                 index = queue_next->index;
             }
@@ -2580,7 +2661,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
             }
         }
         if (index < 0)
-            return eof_or_idle(output);
+            return eof_or_idle(output, need_lock);
         VPRINT(this, 2,
                "next_record[%d]: advancing to timestamp %" PRIu64 " == input #%d\n",
@@ -2669,6 +2750,105 @@ scheduler_tmpl_t<RecordType, ReaderType>::process_marker(input_info_t &input,
         } else {
             input.switch_to_input = it->second;
         }
+        // Trigger a switch either indefinitely or until timeout.
+        if (input.skip_next_unscheduled) {
+            // The underlying kernel mechanism being modeled only supports a single
+            // request: they cannot accumulate.  Timing differences in the trace could
+            // perhaps result in multiple lining up when they didn't in the real app;
+            // but changing the scheme here could also push representativeness in the
+            // other direction.
+            input.skip_next_unscheduled = false;
+            VPRINT(this, 3,
+                   "input %d unschedule request ignored due to prior schedule request "
+                   "@%" PRIu64 "\n",
+                   input.index, input.reader->get_last_timestamp());
+            break;
+        }
+        input.unscheduled = true;
+        if (input.syscall_timeout_arg > 0) {
+            input.blocked_time = scale_blocked_time(input.syscall_timeout_arg);
+            VPRINT(this, 3, "input %d unscheduled for %" PRIu64 " @%" PRIu64 "\n",
+                   input.index, input.blocked_time, input.reader->get_last_timestamp());
+        } else {
+            VPRINT(this, 3, "input %d unscheduled indefinitely @%" PRIu64 "\n",
+                   input.index, input.reader->get_last_timestamp());
+        }
+        break;
+    }
+    case TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT:
+        // This is cleared at the post-syscall instr.
+        input.syscall_timeout_arg = static_cast<uint64_t>(marker_value);
+        break;
+    case TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE:
+        if (!options_.honor_direct_switches)
+            break;
+        if (input.skip_next_unscheduled) {
+            input.skip_next_unscheduled = false;
+            VPRINT(this, 3,
+                   "input %d unschedule request ignored due to prior schedule request "
+                   "@%" PRIu64 "\n",
+                   input.index, input.reader->get_last_timestamp());
+            break;
+        }
+        // Trigger a switch either indefinitely or until timeout.
+        input.unscheduled = true;
+        if (input.syscall_timeout_arg > 0) {
+            input.blocked_time = scale_blocked_time(input.syscall_timeout_arg);
+            VPRINT(this, 3, "input %d unscheduled for %" PRIu64 " @%" PRIu64 "\n",
+                   input.index, input.blocked_time, input.reader->get_last_timestamp());
+        } else {
+            VPRINT(this, 3, "input %d unscheduled indefinitely @%" PRIu64 "\n",
+                   input.index, input.reader->get_last_timestamp());
+        }
+        break;
+    case TRACE_MARKER_TYPE_SYSCALL_SCHEDULE: {
+        if (!options_.honor_direct_switches)
+            break;
+        memref_tid_t target_tid = marker_value;
+        auto it = tid2input_.find(workload_tid_t(input.workload, target_tid));
+        if (it == tid2input_.end()) {
+            VPRINT(this, 1,
+                   "Failed to find input for switchto::resume target tid %" PRId64 "\n",
+                   target_tid);
+            return;
+        }
+        input_ordinal_t target_idx = it->second;
+        VPRINT(this, 3, "input %d re-scheduling input %d @%" PRIu64 "\n", input.index,
+               target_idx, input.reader->get_last_timestamp());
+        // Release the input lock before acquiring sched_lock, to meet our lock
+        // ordering convention to avoid deadlocks.
+        input.lock->unlock();
+        {
+            bool need_sched_lock;
+            auto scoped_sched_lock =
+                acquire_scoped_sched_lock_if_necessary(need_sched_lock);
+            input_info_t *target = &inputs_[target_idx];
+            std::lock_guard<std::mutex> lock(*target->lock);
+            if (target->unscheduled) {
+                target->unscheduled = false;
+                if (unscheduled_priority_.find(target)) {
+                    add_to_ready_queue(target);
+                    unscheduled_priority_.erase(target);
+                } else if (ready_priority_.find(target)) {
+                    // We assume blocked_time is from _ARG_TIMEOUT and is not from
+                    // regularly-blocking i/o.  We assume i/o getting into the mix is
+                    // rare enough or does not matter enough to try to have separate
+                    // timeouts.
+                    if (target->blocked_time > 0) {
+                        VPRINT(
+                            this, 3,
+                            "switchto::resume erasing blocked time for target input %d\n",
+                            target->index);
+                        --num_blocked_;
+                        target->blocked_time = 0;
+                    }
+                }
+            } else {
+                VPRINT(this, 3, "input %d will skip next unschedule\n", target_idx);
+                target->skip_next_unscheduled = true;
+            }
+        }
+        input.lock->lock();
+        break;
+    }
     default: // Nothing to do.
         break;
     }
@@ -2719,7 +2899,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     if (outputs_[output].cur_input < 0) {
         // This happens with more outputs than inputs.  For non-empty outputs we
         // require cur_input to be set to >=0 during init().
-        return eof_or_idle(output);
+        return eof_or_idle(output, /*hold_sched_lock=*/false);
     }
     input = &inputs_[outputs_[output].cur_input];
     auto lock = std::unique_lock<std::mutex>(*input->lock);
@@ -2794,6 +2974,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     bool preempt = false;
     uint64_t blocked_time = 0;
     uint64_t prev_time_in_quantum = 0;
+    // XXX i#6831: Refactor to use subclasses or templates to specialize
+    // scheduler code based on mapping options, to avoid these top-level
+    // conditionals in many functions?
     if (options_.mapping == MAP_AS_PREVIOUSLY) {
         assert(outputs_[output].record_index >= 0);
         if (outputs_[output].record_index >=
@@ -2833,34 +3016,48 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     } else if (options_.mapping == MAP_TO_ANY_OUTPUT) {
         trace_marker_type_t marker_type;
         uintptr_t marker_value;
-        // While regular traces typically always have a syscall marker when there's a
-        // maybe-blocking marker, some tests and synthetic traces have just the maybe
-        // so we check both.
+        // While regular traces typically always have a syscall marker when
+        // there's a maybe-blocking marker, some tests and synthetic traces have
+        // just the maybe so we check both.
         if (input->processing_syscall || input->processing_maybe_blocking_syscall) {
             // Wait until we're past all the markers associated with the syscall.
-            // XXX: We may prefer to stop before the return value marker for futex,
-            // or a kernel xfer marker, but our recorded format is on instr
+            // XXX: We may prefer to stop before the return value marker for
+            // futex, or a kernel xfer marker, but our recorded format is on instr
             // boundaries so we live with those being before the switch.
             // XXX: Once we insert kernel traces, we may have to try harder
             // to stop before the post-syscall records.
             if (record_type_is_instr_boundary(record, outputs_[output].last_record)) {
-                if (syscall_incurs_switch(input, blocked_time)) {
-                    // Model as blocking and should switch to a different input.
+                if (input->switch_to_input != INVALID_INPUT_ORDINAL) {
+                    // The switch request overrides any latency threshold.
                     need_new_input = true;
                     VPRINT(this, 3,
-                           "next_record[%d]: hit blocking syscall in input %d\n",
+                           "next_record[%d]: direct switch on low-latency "
+                           "syscall in input %d\n",
                            output, input->index);
-                } else if (input->switch_to_input != INVALID_INPUT_ORDINAL) {
-                    // The switch request overrides any latency threshold.
+                } else if (input->blocked_time > 0) {
+                    // If we've found out another way that this input should
+                    // block, use that time and do a switch.
                     need_new_input = true;
+                    blocked_time = input->blocked_time;
                     VPRINT(this, 3,
-                           "next_record[%d]: direct switch on low-latency syscall in "
-                           "input %d\n",
+                           "next_record[%d]: blocked time set for input %d\n", output,
+                           input->index);
+                } else if (input->unscheduled) {
+                    need_new_input = true;
+                    VPRINT(this, 3, "next_record[%d]: input %d going unscheduled\n",
                            output, input->index);
+                } else if (syscall_incurs_switch(input, blocked_time)) {
+                    // Model as blocking and should switch to a different input.
+                    need_new_input = true;
+                    VPRINT(this, 3,
+                           "next_record[%d]: hit blocking syscall in input %d\n",
+                           output, input->index);
                 }
                 input->processing_syscall = false;
                 input->processing_maybe_blocking_syscall = false;
                 input->pre_syscall_timestamp = 0;
+                input->syscall_timeout_arg = 0;
             }
         }
         if (outputs_[output].hit_switch_code_end) {
@@ -3099,8 +3296,12 @@ scheduler_tmpl_t<RecordType, ReaderType>::mark_input_eof(input_info_t &input)
 
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
-scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output)
+scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
+                                                      bool hold_sched_lock)
 {
+    // XXX i#6831: Refactor to use subclasses or templates to specialize
+    // scheduler code based on mapping options, to avoid these top-level
+    // conditionals in many functions?
     if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT ||
         live_input_count_.load(std::memory_order_acquire) == 0 ||
         // While a full schedule recorded should have each input hit either its
@@ -3111,6 +3312,49 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output)
         assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof);
         return sched_type_t::STATUS_EOF;
     } else {
+        if (options_.mapping == MAP_TO_ANY_OUTPUT) {
+            // Workaround to avoid hangs when _SCHEDULE and/or _DIRECT_THREAD_SWITCH
+            // directives miss their targets (due to running with a subset of the
+            // original threads, or other scenarios) and we end up with no scheduled
+            // inputs but a set of unscheduled inputs that will never be scheduled.
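Stepping back, the reordered checks in next_record() above establish a clear precedence at the post-syscall instruction boundary. Restated as a sketch over a simplified stand-in struct (hypothetical, not the scheduler's real interface):

    #include <cstdint>

    struct syscall_state_t {
        int switch_to_input;   // -1 unless a _DIRECT_THREAD_SWITCH target was seen.
        uint64_t blocked_time; // Nonzero if a _SYSCALL_ARG_TIMEOUT set a block time.
        bool unscheduled;      // Set by _SYSCALL_UNSCHEDULE (or a missed switch).
    };

    // Mirrors the precedence in the hunk above: direct switch first, then a
    // pre-computed block time, then indefinite unschedule, then the plain
    // syscall-latency heuristic.
    bool
    should_switch(const syscall_state_t &s, bool latency_exceeds_threshold)
    {
        if (s.switch_to_input != -1)
            return true;
        if (s.blocked_time > 0)
            return true;
        if (s.unscheduled)
            return true;
        return latency_exceeds_threshold;
    }
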
+            auto scoped_lock = hold_sched_lock
+                ? std::unique_lock<std::mutex>()
+                : std::unique_lock<std::mutex>(sched_lock_);
+            VPRINT(this, 4,
+                   "eof_or_idle output=%d live=%d unsched=%zu runq=%zu blocked=%d\n",
+                   output, live_input_count_.load(std::memory_order_acquire),
+                   unscheduled_priority_.size(), ready_priority_.size(), num_blocked_);
+            if (ready_priority_.empty() && !unscheduled_priority_.empty()) {
+                if (outputs_[output].wait_start_time == 0) {
+                    outputs_[output].wait_start_time = get_output_time(output);
+                } else {
+                    uint64_t now = get_output_time(output);
+                    if (now - outputs_[output].wait_start_time >
+                        options_.block_time_max) {
+                        // XXX i#6822: We may want some other options here for what to
+                        // do.  We could release just one input at a time, which would be
+                        // the same scheduling order (as we have FIFO in
+                        // unscheduled_priority_) but may take a long time at
+                        // block_time_max each; we could declare we're done and just
+                        // exit, maybe under a flag or if we could see what % of total
+                        // records we've processed.
+                        VPRINT(this, 1,
+                               "eof_or_idle moving entire unscheduled queue to ready "
+                               "queue\n");
+                        while (!unscheduled_priority_.empty()) {
+                            input_info_t *tomove = unscheduled_priority_.top();
+                            std::lock_guard<std::mutex> lock(*tomove->lock);
+                            tomove->unscheduled = false;
+                            ready_priority_.push(tomove);
+                            unscheduled_priority_.pop();
+                        }
+                        outputs_[output].wait_start_time = 0;
+                    }
+                }
+            } else {
+                outputs_[output].wait_start_time = 0;
+            }
+        }
         outputs_[output].waiting = true;
         set_cur_input(output, INVALID_INPUT_ORDINAL);
         return sched_type_t::STATUS_IDLE;
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index e0fe4401c25..5ca8207512f 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -578,6 +578,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * #QUANTUM_TIME simulator time or wall-clock microseconds for
          * #QUANTUM_INSTRUCTIONS), for an input to be considered blocked for any one
          * system call.  This is applied after multiplying by #block_time_scale.
+         * This is also used as a fallback to avoid hangs when there are no scheduled
+         * inputs: if the only inputs left are "unscheduled" (see
+         * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), after this amount of time those
+         * inputs are all re-scheduled.
          */
         uint64_t block_time_max = 25000000;
         // XXX: Should we share the file-to-reader code currently in the scheduler
@@ -639,13 +643,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         /**
          * If true, the scheduler will attempt to switch to the recorded targets of
          * #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH system call metadata markers
-         * regardless of system call latency.  If the target is not available, the
-         * current implementation will select the next available input in the regular
-         * scheduling queue, but in the future a forced migration may be applied for an
-         * input currently on another output.  If false, the direct switch markers are
-         * ignored and only system call latency thresholds are used to determine
-         * switches (the #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers remain: they
-         * are not removed from the trace).
+         * regardless of system call latency.  Furthermore, the scheduler will model
+         * "unscheduled" semantics and honor the #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE
+         * and #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE markers.  If false, these markers are
+         * ignored and only system call latency thresholds are used to determine switches
+         * (these markers remain: they are not removed from the trace).
          */
         bool honor_direct_switches = true;
     };
@@ -1205,7 +1207,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // with schedule_t "this" access for the comparator to compile: it is not
         // simple to do so, however.)
         bool order_by_timestamp = false;
-        // Global ready queue counter used to provide FIFO for same-priority inputs.
+        // Global queue counter used to provide FIFO for same-priority inputs.
+        // This value is only valid when this input is in a queue; it is set upon
+        // being added to a queue.
         uint64_t queue_counter = 0;
         // Used to switch on the instruction *after* a long-latency syscall.
         bool processing_syscall = false;
@@ -1214,6 +1218,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // Use for special kernel features where one thread specifies a target
         // thread to replace it.
         input_ordinal_t switch_to_input = INVALID_INPUT_ORDINAL;
+        uint64_t syscall_timeout_arg = 0;
         // Used to switch before we've read the next instruction.
         bool switching_pre_instruction = false;
         // Used for time-based quanta.
@@ -1223,6 +1228,15 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // The units are us for instr quanta and simuilation time for time quanta.
         uint64_t blocked_time = 0;
         uint64_t blocked_start_time = 0;
+        // An input can be "unscheduled" and not on the ready_priority_ run queue at all
+        // with an infinite timeout until directly targeted.  Such inputs are stored
+        // in the unscheduled_priority_ queue.
+        // This field is also set to true for inputs that are "unscheduled" but with
+        // a timeout, even though that is implemented by storing them in ready_priority_
+        // (because that is our mechanism for measuring timeouts).
+        bool unscheduled = false;
+        // Causes the next unscheduled entry to abort.
+        bool skip_next_unscheduled = false;
     };
 
     // Format for recording a schedule to disk.  A separate sequence of these records
@@ -1628,8 +1642,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     void
     mark_input_eof(input_info_t &input);
 
+    // Determines whether to exit or wait for other outputs when one output
+    // runs out of things to do.  May end up scheduling new inputs.
     stream_status_t
-    eof_or_idle(output_ordinal_t output);
+    eof_or_idle(output_ordinal_t output, bool hold_sched_lock);
 
     // Returns whether the current record for the current input stream scheduled on
     // the 'output_ordinal'-th output stream is from a part of the trace corresponding
@@ -1660,14 +1676,29 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         }
     };
 
+    bool
+    need_sched_lock();
+
+    std::unique_lock<std::mutex>
+    acquire_scoped_sched_lock_if_necessary(bool &need_lock);
+
     // sched_lock_ must be held by the caller.
     bool
     ready_queue_empty();
 
     // sched_lock_ must be held by the caller.
+    // If input->unscheduled is true and input->blocked_time is 0, input
+    // is placed on the unscheduled_priority_ queue instead.
     void
     add_to_ready_queue(input_info_t *input);
 
+    // sched_lock_ must be held by the caller.
+    void
+    add_to_unscheduled_queue(input_info_t *input);
+
+    uint64_t
+    scale_blocked_time(uint64_t blocked_time) const;
+
     // The input's lock must be held by the caller.
     // Returns a multiplier for how long the input should be considered blocked.
     bool
@@ -1694,18 +1725,23 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     std::vector<output_info_t> outputs_;
     // We use a central lock for global scheduling.  We assume the synchronization
     // cost is outweighed by the simulator's overhead.  This protects concurrent
-    // access to inputs_.size(), outputs_.size(), ready_priority_, and
-    // ready_counter_.
+    // access to inputs_.size(), outputs_.size(), ready_priority_,
+    // ready_counter_, unscheduled_priority_, and unscheduled_counter_.
+    // This cannot be acquired while holding an input lock: it must
+    // be acquired first, to avoid deadlocks.
     std::mutex sched_lock_;
     // Inputs ready to be scheduled, sorted by priority and then timestamp if timestamp
     // dependencies are requested.  We use the timestamp delta from the first observed
     // timestamp in each workload in order to mix inputs from different workloads in the
     // same queue.  FIFO ordering is used for same-priority entries.
     flexible_queue_t<input_info_t *, InputTimestampComparator> ready_priority_;
+    // Inputs that are unscheduled indefinitely unless directly targeted.
+    flexible_queue_t<input_info_t *, InputTimestampComparator> unscheduled_priority_;
     // Trackes the count of blocked inputs.  Protected by sched_lock_.
     int num_blocked_ = 0;
-    // Global ready queue counter used to provide FIFO for same-priority inputs.
+    // Global queue counters used to provide FIFO for same-priority inputs.
     uint64_t ready_counter_ = 0;
+    uint64_t unscheduled_counter_ = 0;
     // Count of inputs not yet at eof.
     std::atomic<int> live_input_count_;
     // In replay mode, count of outputs not yet at the end of the replay sequence.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index e0ed0702f92..eeec09c073c 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -3734,10 +3734,13 @@ static void
 test_direct_switch()
 {
     std::cerr << "\n----------------\nTesting direct switches\n";
+    // This tests just direct switches with no unscheduled inputs or related
+    // switch requests.
     // We have just 1 output to better control the order and avoid flakiness.
     static constexpr int NUM_OUTPUTS = 1;
     static constexpr int QUANTUM_DURATION = 100; // Never reached.
     static constexpr int BLOCK_LATENCY = 100;
+    static constexpr int SWITCH_TIMEOUT = 2000;
     static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY);
     static constexpr memref_tid_t TID_BASE = 100;
     static constexpr memref_tid_t TID_A = TID_BASE + 0;
@@ -3756,6 +3759,9 @@ test_direct_switch()
         make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
         make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
         make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // This test focuses on direct only with nothing "unscheduled";
+        // thus, we always provide a timeout to avoid going unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
         make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_C),
         make_timestamp(4001),
         make_instr(/*pc=*/401),
@@ -3786,11 +3792,16 @@ test_direct_switch()
         make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
         make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
         make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // This test focuses on direct only with nothing "unscheduled";
+        // thus, we always provide a timeout to avoid going unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
         make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_A),
         make_timestamp(5001),
         make_instr(/*pc=*/501),
         // Test a non-existent target: should be ignored, but not crash.
         make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // This test focuses on direct only with nothing "unscheduled".
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
         make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_BASE + 3),
         make_exit(TID_C),
     };
@@ -3809,8 +3820,7 @@ test_direct_switch()
     // has significant blocked time left.  But then after B is scheduled and finishes,
     // we still have to wait for C's block time so we see idle underscores:
     static const char *const CORE0_SCHED_STRING =
-        "...AA.........CC......A....BBBB.______________C...";
-
+        "...AA..........CC.......A....BBBB.__________C....";
     std::vector<scheduler_t::input_workload_t> sched_inputs;
     sched_inputs.emplace_back(std::move(readers));
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
@@ -3846,7 +3856,389 @@ test_direct_switch()
         // We expect A's first switch to be to B with an earlier timestamp.
         // We expect C's direct switch to A to not happen until A's blocked time ends.
         static const char *const CORE0_SCHED_STRING =
-            "...AA.........BBBB....CC......___________________C...___A.";
+            "...AA..........BBBB....CC.......___________________C....___A.";
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        sched_ops.quantum_duration = QUANTUM_DURATION;
+        // We use our mock's time==instruction count for a deterministic result.
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        sched_ops.honor_direct_switches = false;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+}
+
+static void
+test_unscheduled()
+{
+    std::cerr << "\n----------------\nTesting unscheduled inputs\n";
+    // We have just 1 output to better control the order and avoid flakiness.
+    static constexpr int NUM_OUTPUTS = 1;
+    static constexpr int QUANTUM_DURATION = 100; // Never reached.
+    static constexpr int BLOCK_LATENCY = 100;
+    static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY);
+    static constexpr int SWITCH_TIMEOUT = 2000;
+    static constexpr memref_tid_t TID_BASE = 100;
+    static constexpr memref_tid_t TID_A = TID_BASE + 0;
+    static constexpr memref_tid_t TID_B = TID_BASE + 1;
+    static constexpr memref_tid_t TID_C = TID_BASE + 2;
+    std::vector<trace_entry_t> refs_A = {
+        make_thread(TID_A),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // A has the earliest timestamp and starts.
+        make_timestamp(1001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/101),
+        make_instr(/*pc=*/102),
+        make_timestamp(1002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // Test going unscheduled with no timeout.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        make_timestamp(4202),
+        // B makes us scheduled again.
+        make_instr(/*pc=*/103),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // Switch to B to test a direct switch to unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
+        make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_B),
+        make_timestamp(4402),
+        make_instr(/*pc=*/401),
+        make_exit(TID_A),
+    };
+    std::vector<trace_entry_t> refs_B = {
+        make_thread(TID_B),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // B runs next by timestamp.
+        make_timestamp(2001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/200),
+        // B goes unscheduled with a timeout.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        // C will run at this point.
+        // Then, C blocks and our timeout lapses and we run again.
+        make_timestamp(4001),
+        make_instr(/*pc=*/201),
+        // B tells C to not go unscheduled later.
+        make_timestamp(4002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_C),
+        make_timestamp(4004),
+        make_instr(/*pc=*/202),
+        // B makes A no longer unscheduled.
+        make_timestamp(4006),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_A),
+        make_timestamp(4011),
+        make_instr(/*pc=*/202),
+        // B now goes unscheduled with no timeout.
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        // A switches to us.
+        make_instr(/*pc=*/203),
+        make_instr(/*pc=*/204),
+        make_instr(/*pc=*/205),
+        make_instr(/*pc=*/206),
+        make_exit(TID_B),
+    };
+    std::vector<trace_entry_t> refs_C = {
+        make_thread(TID_C),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // C goes 3rd by timestamp.
+        make_timestamp(3001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/301),
+        make_instr(/*pc=*/302),
+        make_timestamp(3002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        // C makes a long-latency blocking syscall, testing whether
+        // A is still unscheduled.
+        // We also test _SCHEDULE avoiding a future unschedule when C
+        // unblocks.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_timestamp(7002),
+        make_instr(/*pc=*/501),
+        // C asks to go unscheduled with no timeout, but a prior _SCHEDULE
+        // means it just continues.
+        make_timestamp(7004),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        make_timestamp(7008),
+        make_instr(/*pc=*/502),
+        make_exit(TID_C),
+    };
+    {
+        // Test the defaults with direct switches enabled.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_C)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_C);
+        // The string constructor writes "." for markers.
+        // Matching the comments above, we expect A to go unscheduled;
+        // then B runs and goes unscheduled-with-timeout; C takes over and blocks.
+        // We then have _=idle confirming A is unscheduled and B blocked.
+        // B then runs and makes A schedulable before going unscheduled.
+        // A then runs and switches back to B with a timeout.  B exits; A's timeout
+        // has lapsed so it runs; finally we wait idle for C's long block to finish,
+        // after which C runs and *does not unschedule* b/c of B's prior request.
+        static const char *const CORE0_SCHED_STRING =
+            "...AA.........B........CC.....______________B......B......B....A......BBBB."
+            "A._________________C......C.";
+
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        sched_ops.quantum_duration = QUANTUM_DURATION;
+        // We use our mock's time==instruction count for a deterministic result.
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+    {
+        // Test disabling direct switches.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_C)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_C);
+        // The syscall latencies make this schedule not all that different: we just
+        // finish B instead of switching to A toward the end.
+        static const char *const CORE0_SCHED_STRING =
+            "...AA.........B........CC.....__________________B......B......B....BBBB.____"
+            "A......__A._______C......C.";
+
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        sched_ops.quantum_duration = QUANTUM_DURATION;
+        // We use our mock's time==instruction count for a deterministic result.
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        sched_ops.honor_direct_switches = false;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+}
+
+static void
+test_unscheduled_fallback()
+{
+    std::cerr << "\n----------------\nTesting unscheduled hang workarounds\n";
+    // We have just 1 output to better control the order and avoid flakiness.
+    static constexpr int NUM_OUTPUTS = 1;
+    static constexpr int QUANTUM_DURATION = 100; // Never reached.
+    static constexpr int BLOCK_LATENCY = 100;
+    static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY);
+    static constexpr uint64_t BLOCK_TIME_MAX = 500;
+    static constexpr int SWITCH_TIMEOUT = 2000;
+    static constexpr memref_tid_t TID_BASE = 100;
+    static constexpr memref_tid_t TID_A = TID_BASE + 0;
+    static constexpr memref_tid_t TID_B = TID_BASE + 1;
+    static constexpr memref_tid_t TID_C = TID_BASE + 2;
+    std::vector<trace_entry_t> refs_A = {
+        make_thread(TID_A),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // A has the earliest timestamp and starts.
+        make_timestamp(1001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/101),
+        make_instr(/*pc=*/102),
+        make_timestamp(1002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // Test going unscheduled with no timeout.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        make_timestamp(4202),
+        // B makes us scheduled again.
+        make_instr(/*pc=*/102),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        // Switch to a missing thread to leave us unscheduled; B also went
+        // unscheduled, leaving nothing scheduled, to test hang workarounds.
+        make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_BASE + 4),
+        make_timestamp(4402),
+        // We won't get here until the no-scheduled-input hang workaround.
+        make_instr(/*pc=*/401),
+        make_exit(TID_A),
+    };
+    std::vector<trace_entry_t> refs_B = {
+        make_thread(TID_B),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // B runs next by timestamp.
+        make_timestamp(2001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/200),
+        // B goes unscheduled with a timeout.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        // C will run at this point.
+        // Then, C blocks and our timeout lapses and we run again.
+        make_timestamp(4001),
+        make_instr(/*pc=*/201),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        // B makes A no longer unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_A),
+        make_timestamp(4011),
+        make_instr(/*pc=*/202),
+        // B now goes unscheduled with no timeout.
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0),
+        // We won't get here until the hang workaround.
+        make_instr(/*pc=*/203),
+        make_instr(/*pc=*/204),
+        make_instr(/*pc=*/205),
+        make_instr(/*pc=*/206),
+        make_exit(TID_B),
+    };
+    std::vector<trace_entry_t> refs_C = {
+        make_thread(TID_C),
+        make_pid(1),
+        make_version(TRACE_ENTRY_VERSION),
+        // C goes 3rd by timestamp.
+        make_timestamp(3001),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        make_instr(/*pc=*/301),
+        make_instr(/*pc=*/302),
+        make_timestamp(3002),
+        make_marker(TRACE_MARKER_TYPE_CPU_ID, 0),
+        // C makes a long-latency blocking syscall, testing whether
+        // A is still unscheduled.
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 999),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_timestamp(7002),
+        make_instr(/*pc=*/501),
+        make_exit(TID_C),
+    };
+    {
+        // Test the defaults with direct switches enabled.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_C)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_C);
+        // This looks like the schedule in test_unscheduled() up until "..A.." when
+        // we have a BLOCK_TIME_MAX-long idle period:
+        static const char *const CORE0_SCHED_STRING =
+            "...AA.........B........CC.....______________B......B....A.....______________"
+            "_________C._________________________________________________________________"
+            "____________________________________________________________________________"
+            "____________________________________________________________________________"
+            "____________________________________________________________________________"
+            "____________________________________________________________________________"
+            "____________________________________________________________________________"
+            "_________________________________________________________BBBB.A.";
+
+        std::vector<scheduler_t::input_workload_t> sched_inputs;
+        sched_inputs.emplace_back(std::move(readers));
+        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
+                                                   scheduler_t::DEPENDENCY_TIMESTAMPS,
+                                                   scheduler_t::SCHEDULER_DEFAULTS,
+                                                   /*verbosity=*/3);
+        sched_ops.quantum_duration = QUANTUM_DURATION;
+        // We use our mock's time==instruction count for a deterministic result.
+        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+        sched_ops.blocking_switch_threshold = BLOCK_LATENCY;
+        sched_ops.block_time_scale = BLOCK_SCALE;
+        sched_ops.block_time_max = BLOCK_TIME_MAX;
+        scheduler_t scheduler;
+        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+            scheduler_t::STATUS_SUCCESS)
+            assert(false);
+        std::vector<std::string> sched_as_string =
+            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
+        for (int i = 0; i < NUM_OUTPUTS; i++) {
+            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
+        }
+        assert(sched_as_string[0] == CORE0_SCHED_STRING);
+    }
+    {
+        // Test disabling direct switches.
+        std::vector<scheduler_t::input_reader_t> readers;
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_A)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_A);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_B)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_B);
+        readers.emplace_back(std::unique_ptr<mock_reader_t>(new mock_reader_t(refs_C)),
+                             std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_C);
+        // This result is identical to the one in test_unscheduled().
+        static const char *const CORE0_SCHED_STRING =
+            "...AA.........B........CC.....__________________B......B....BBBB._____A....."
+ "__A._______C."; std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); @@ -3859,6 +4251,7 @@ test_direct_switch() sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_max = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4382,6 +4775,8 @@ test_main(int argc, const char *argv[]) test_replay_as_traced_sort(); test_inactive(); test_direct_switch(); + test_unscheduled(); + test_unscheduled_fallback(); test_kernel_switch_sequences(); test_random_schedule(); test_record_scheduler();