diff --git a/api/docs/release.dox b/api/docs/release.dox
index aa1fbb21fe4..bd672d69890 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -132,6 +132,8 @@ Further non-compatibility-affecting changes include:
- Added new fields analyze_case_ex and instrument_instr_ex to #drbbdup_options_t.
- Added drbbdup support to drwrap via #DRWRAP_INVERT_CONTROL, drwrap_invoke_insert(),
and drwrap_invoke_insert_cleanup_only().
+ - Added -trace_for_instrs and -retrace_every_instrs options to drcachesim
+ for periodic trace bursts of an unmodified application.
The changes between version 9.0.1 and 9.0.0 include the following compatibility
changes:
@@ -449,8 +451,6 @@ Further non-compatibility-affecting changes include:
dr_get_process_id() in some contexts.
- The private loader's malloc redirection now guarantees double-pointer-sized
alignment, to match what system-provided allocators use.
- - Added a new DR extension, namely "drbbdup", which enables different case
- instrumentation of the same basic block by duplicating code.
**************************************************
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
index 8e3a5428049..7c0c5d13fe6 100644
--- a/clients/drcachesim/common/options.cpp
+++ b/clients/drcachesim/common/options.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2015-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2015-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -240,15 +240,30 @@ droption_t<bytesize_t> op_trace_after_instrs(
DROPTION_SCOPE_CLIENT, "trace_after_instrs", 0,
"Do not start tracing until N instructions",
"If non-zero, this causes tracing to be suppressed until this many dynamic "
- "instruction "
- "executions are observed. At that point, regular tracing is put into place. "
+ "instruction executions are observed from the start of the application. "
+ "At that point, regular tracing is put into place. "
"The threshold should be considered approximate, especially for larger values. "
- "Switching to regular tracing takes some amount of time during which other "
- "threads than the one that triggered the switch can continue to execute, "
- "resulting in a larger count of executed instructions before tracing actually "
- "starts than this given threshold. "
- "Use -max_trace_size or -max_global_trace_refs to set a limit on the subsequent "
- "trace length.");
+ "Use -trace_for_instrs or -max_trace_size to set a limit on the subsequent trace "
+ "length. Use -retrace_every_instrs to trace repeatedly.");
+
+droption_t<bytesize_t> op_trace_for_instrs(
+ DROPTION_SCOPE_CLIENT, "trace_for_instrs", 0,
+ "After tracing N instructions, stop tracing, but continue executing.",
+ "If non-zero, this stops recording a trace after the specified number of "
+ "instructions are traced. Unlike -exit_after_tracing, which kills the "
+ "application (and counts data as well as instructions), the application "
+ "continues executing. This can be combined with -retrace_every_instrs. "
+ "The actual trace period may vary slightly from this number due to optimizations "
+ "that reduce the overhead of instruction counting.");
+
+droption_t<bytesize_t> op_retrace_every_instrs(
+ DROPTION_SCOPE_CLIENT, "retrace_every_instrs", 0,
+ "Trace for -trace_for_instrs, execute this many, and repeat.",
+ "This option augments -trace_for_instrs. After tracing concludes, this option "
+ "causes non-traced instructions to be counted and after the number specified by "
+ "this option, tracing start up again for the -trace_for_instrs duration. This "
+ "process repeats itself. This can be combined with -trace_after_instrs for an "
+ "initial period of non-tracing.");
droption_t<bytesize_t> op_exit_after_tracing(
DROPTION_SCOPE_CLIENT, "exit_after_tracing", 0,
diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h
index aba7343b8a2..45f5ab58f53 100644
--- a/clients/drcachesim/common/options.h
+++ b/clients/drcachesim/common/options.h
extern droption_t<bool> op_cpu_scheduling;
extern droption_t<bytesize_t> op_max_trace_size;
extern droption_t<bytesize_t> op_max_global_trace_refs;
extern droption_t<bytesize_t> op_trace_after_instrs;
+extern droption_t<bytesize_t> op_trace_for_instrs;
+extern droption_t<bytesize_t> op_retrace_every_instrs;
extern droption_t<bytesize_t> op_exit_after_tracing;
extern droption_t<bool> op_online_instr_types;
extern droption_t<std::string> op_replace_policy;
diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h
index 420e007ee13..bbde2ba41b5 100644
--- a/clients/drcachesim/common/trace_entry.h
+++ b/clients/drcachesim/common/trace_entry.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2015-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2015-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -326,6 +326,14 @@ typedef enum {
*/
TRACE_MARKER_TYPE_RSEQ_ABORT,
+ /**
+ * The marker value contains the ordinal of a window during a multi-window
+ * tracing run (see the options -trace_for_instrs and -retrace_every_instrs).
+ * When a marker appears whose ordinal differs from the last-seen marker's,
+ * a time gap may exist immediately before the new marker, covering the
+ * period when tracing was disabled.
+ */
+ TRACE_MARKER_TYPE_WINDOW_ID,
+
// ...
// These values are reserved for future built-in marker types.
// ...
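A minimal analyzer-side sketch of consuming this marker, mirroring the bucketing that basic_counts_t adopts later in this patch (assumed stand-in types; only the marker-handling core is shown, not the real analysis_tool_t interface). It would be called for each TRACE_MARKER_TYPE_WINDOW_ID record:

    #include <cstdint>
    #include <vector>

    // One statistics bucket per tracing window.
    struct window_stats_t {
        int64_t instrs = 0;
    };

    struct shard_state_t {
        std::vector<window_stats_t> windows = std::vector<window_stats_t>(1);
        uintptr_t last_window = 0;
    };

    // marker_value carries the 0-based window ordinal; grow the bucket list
    // whenever a new ordinal appears and return the active bucket.
    static window_stats_t &
    current_window(shard_state_t &shard, uintptr_t marker_value)
    {
        if (marker_value != shard.last_window) {
            shard.last_window = marker_value;
            shard.windows.resize(marker_value + 1);
        }
        return shard.windows[shard.windows.size() - 1];
    }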
diff --git a/clients/drcachesim/drcachesim.dox.in b/clients/drcachesim/drcachesim.dox.in
index 0af6e9419ed..b03e0e6163f 100644
--- a/clients/drcachesim/drcachesim.dox.in
+++ b/clients/drcachesim/drcachesim.dox.in
@@ -161,6 +161,8 @@ Some of the more important markers are:
- #TRACE_MARKER_TYPE_FUNC_ID, #TRACE_MARKER_TYPE_FUNC_RETADDR, #TRACE_MARKER_TYPE_FUNC_ARG, #TRACE_MARKER_TYPE_FUNC_RETVAL - These markers are used to capture information about function calls. Which functions to capture must be explicitly selected at tracing time. Typical candidates are heap allocation and freeing functions. See \ref sec_drcachesim_funcs.
+- #TRACE_MARKER_TYPE_WINDOW_ID - The marker value contains the ordinal of the trace burst window when a multi-window trace was collected (see \ref sec_drcachesim_partial).
+
The full set of markers is listed under the enum #trace_marker_type_t.
****************************************************************************
@@ -954,6 +956,15 @@ dynamic instruction executions. This can be used to skip initialization
and arrive at the desired starting point. The trace's length can be
limited in several ways:
+- The \p -trace_for_instrs option stops tracing after the specified number
+ of dynamic instructions (see the example following this list).
+- The \p -retrace_every_instrs option augments \p -trace_for_instrs by
+ executing its specified instruction count without tracing and then
+ re-enabling tracing for \p -trace_for_instrs again, resulting in
+ tracing windows repeated at regular intervals throughout the execution.
+ A single final trace is created at the end, with #TRACE_MARKER_TYPE_WINDOW_ID
+ markers (see \ref sec_drcachesim_format_other) identifying the trace window
+ transitions.
- The \p -max_global_trace_refs option causes the recording of trace
data to cease once the specified threshold is exceeded by the sum of
all trace references across all threads. One trace reference entry
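As a usage example with hypothetical sizes (these options take the same K/M/G suffixes as the other drcachesim count options, per the bytesize declarations above), running "bin64/drrun -t drcachesim -offline -trace_after_instrs 10M -trace_for_instrs 5M -retrace_every_instrs 45M -- ./app" skips roughly the first 10M instructions and then records a 5M-instruction burst every ~50M instructions until the application exits, producing a single trace whose bursts are delimited by #TRACE_MARKER_TYPE_WINDOW_ID markers.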
diff --git a/clients/drcachesim/tests/allasm-repstr-basic-counts.templatex b/clients/drcachesim/tests/allasm-repstr-basic-counts.templatex
index f8c7cdafc74..650ea6c0bcd 100644
--- a/clients/drcachesim/tests/allasm-repstr-basic-counts.templatex
+++ b/clients/drcachesim/tests/allasm-repstr-basic-counts.templatex
@@ -1,10 +1,18 @@
Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
---- ----
Basic counts tool results:
Total counts:
- 19 total \(fetched\) instructions
- 19 total unique \(fetched\) instructions
-/* One for each movs after the first one. */
+ 95 total \(fetched\) instructions
+ 23 total unique \(fetched\) instructions
4 total non-fetched instructions
0 total prefetches
5 total data loads
@@ -12,26 +20,26 @@ Total counts:
0 total icache flushes
0 total dcache flushes
1 total threads
- .* total scheduling markers
- .* total transfer markers
- .* total function id markers
- .* total function return address markers
- .* total function argument markers
- .* total function return value markers
- .* total other markers
-Thread .* counts:
- 19 \(fetched\) instructions
- 19 unique \(fetched\) instructions
+ 24 total scheduling markers
+ 0 total transfer markers
+ 0 total function id markers
+ 0 total function return address markers
+ 0 total function argument markers
+ 0 total function return value markers
+ 3 total other markers
+Thread [0-9]* counts:
+ 95 \(fetched\) instructions
+ 23 unique \(fetched\) instructions
4 non-fetched instructions
0 prefetches
5 data loads
5 data stores
0 icache flushes
0 dcache flushes
- .* scheduling markers
- .* transfer markers
- .* function id markers
- .* function return address markers
- .* function argument markers
- .* function return value markers
- .* other markers
+ 24 scheduling markers
+ 0 transfer markers
+ 0 function id markers
+ 0 function return address markers
+ 0 function argument markers
+ 0 function return value markers
+ 3 other markers
diff --git a/clients/drcachesim/tests/allasm_repstr.asm b/clients/drcachesim/tests/allasm_repstr.asm
index ec7e2d10480..ec5a2e6ad63 100644
--- a/clients/drcachesim/tests/allasm_repstr.asm
+++ b/clients/drcachesim/tests/allasm_repstr.asm
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2021-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -53,12 +53,17 @@ _start:
cld
rep movsb
- // Print end message.
+ // Print a message in a loop for testing tracing windows.
+ mov ebx, 10 // Loop count.
+repeat:
mov rdi, 2 // stderr
lea rsi, hello_str
mov rdx, 13 // sizeof(hello_str)
mov eax, 1 // SYS_write
syscall
+ dec ebx
+ cmp ebx, 0
+ jnz repeat
// Exit.
mov rdi, 0 // exit code
diff --git a/clients/drcachesim/tests/offline-allasm-repstr-basic-counts.templatex b/clients/drcachesim/tests/offline-allasm-repstr-basic-counts.templatex
index 585532a8ba3..3c22bd6a8dc 100644
--- a/clients/drcachesim/tests/offline-allasm-repstr-basic-counts.templatex
+++ b/clients/drcachesim/tests/offline-allasm-repstr-basic-counts.templatex
@@ -1,9 +1,17 @@
Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
Basic counts tool results:
Total counts:
- 19 total \(fetched\) instructions
- 19 total unique \(fetched\) instructions
-/* One for each movs after the first one. */
+ 95 total \(fetched\) instructions
+ 23 total unique \(fetched\) instructions
4 total non-fetched instructions
0 total prefetches
5 total data loads
@@ -11,26 +19,26 @@ Total counts:
0 total icache flushes
0 total dcache flushes
1 total threads
- .* total scheduling markers
- .* total transfer markers
- .* total function id markers
- .* total function return address markers
- .* total function argument markers
- .* total function return value markers
- .* total other markers
-Thread .* counts:
- 19 \(fetched\) instructions
- 19 unique \(fetched\) instructions
+ 24 total scheduling markers
+ 0 total transfer markers
+ 0 total function id markers
+ 0 total function return address markers
+ 0 total function argument markers
+ 0 total function return value markers
+ 3 total other markers
+Thread [0-9]* counts:
+ 95 \(fetched\) instructions
+ 23 unique \(fetched\) instructions
4 non-fetched instructions
0 prefetches
5 data loads
5 data stores
0 icache flushes
0 dcache flushes
- .* scheduling markers
- .* transfer markers
- .* function id markers
- .* function return address markers
- .* function argument markers
- .* function return value markers
- .* other markers
+ 24 scheduling markers
+ 0 transfer markers
+ 0 function id markers
+ 0 function return address markers
+ 0 function argument markers
+ 0 function return value markers
+ 3 other markers
diff --git a/clients/drcachesim/tests/offline-windows-asm.templatex b/clients/drcachesim/tests/offline-windows-asm.templatex
new file mode 100644
index 00000000000..08ac8845319
--- /dev/null
+++ b/clients/drcachesim/tests/offline-windows-asm.templatex
@@ -0,0 +1,169 @@
+Hit delay threshold: enabling tracing.
+Hit tracing window #0 limit: disabling tracing.
+Adios world!
+Hit retrace threshold: enabling tracing.
+Adios world!
+Hit tracing window #1 limit: disabling tracing.
+Adios world!
+Hit retrace threshold: enabling tracing.
+Adios world!
+Hit tracing window #2 limit: disabling tracing.
+Adios world!
+Hit retrace threshold: enabling tracing.
+Adios world!
+Hit tracing window #3 limit: disabling tracing.
+Adios world!
+Hit retrace threshold: enabling tracing.
+Adios world!
+Hit tracing window #4 limit: disabling tracing.
+Adios world!
+Hit retrace threshold: enabling tracing.
+Adios world!
+Hit tracing window #5 limit: disabling tracing.
+Basic counts tool results:
+Total counts:
+ 50 total \(fetched\) instructions
+ 18 total unique \(fetched\) instructions
+ 4 total non-fetched instructions
+ 0 total prefetches
+ 5 total data loads
+ 5 total data stores
+ 0 total icache flushes
+ 0 total dcache flushes
+ 1 total threads
+ 14 total scheduling markers
+ 0 total transfer markers
+ 0 total function id markers
+ 0 total function return address markers
+ 0 total function argument markers
+ 0 total function return value markers
+ 10 total other markers
+Total windows: 7
+Window #0:
+ 12 window \(fetched\) instructions
+ 12 window unique \(fetched\) instructions
+ 4 window non-fetched instructions
+ 0 window prefetches
+ 5 window data loads
+ 5 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 3 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 4 window other markers
+Window #1:
+ 8 window \(fetched\) instructions
+ 8 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 2 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Window #2:
+ 8 window \(fetched\) instructions
+ 8 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 2 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Window #3:
+ 8 window \(fetched\) instructions
+ 8 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 2 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Window #4:
+ 8 window \(fetched\) instructions
+ 8 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 2 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Window #5:
+ 6 window \(fetched\) instructions
+ 6 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 2 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Window #6:
+ 0 window \(fetched\) instructions
+ 0 window unique \(fetched\) instructions
+ 0 window non-fetched instructions
+ 0 window prefetches
+ 0 window data loads
+ 0 window data stores
+ 0 window icache flushes
+ 0 window dcache flushes
+ 1 window scheduling markers
+ 0 window transfer markers
+ 0 window function id markers
+ 0 window function return address markers
+ 0 window function argument markers
+ 0 window function return value markers
+ 1 window other markers
+Thread [0-9]* counts:
+ 12 \(fetched\) instructions
+ 12 unique \(fetched\) instructions
+ 4 non-fetched instructions
+ 0 prefetches
+ 5 data loads
+ 5 data stores
+ 0 icache flushes
+ 0 dcache flushes
+ 3 scheduling markers
+ 0 transfer markers
+ 0 function id markers
+ 0 function return address markers
+ 0 function argument markers
+ 0 function return value markers
+ 4 other markers
diff --git a/clients/drcachesim/tests/offline-windows-invar.templatex b/clients/drcachesim/tests/offline-windows-invar.templatex
new file mode 100644
index 00000000000..24ad612675e
--- /dev/null
+++ b/clients/drcachesim/tests/offline-windows-invar.templatex
@@ -0,0 +1,6 @@
+Hit delay threshold: enabling tracing.
+Hit tracing window #0 limit: disabling tracing.
+Hit retrace threshold: enabling tracing.
+.*
+Hello, world!.*
+Trace invariant checks passed
diff --git a/clients/drcachesim/tests/offline-windows-simple.templatex b/clients/drcachesim/tests/offline-windows-simple.templatex
new file mode 100644
index 00000000000..dc4b5641daf
--- /dev/null
+++ b/clients/drcachesim/tests/offline-windows-simple.templatex
@@ -0,0 +1,14 @@
+Hit delay threshold: enabling tracing.
+Hit tracing window #0 limit: disabling tracing.
+Hit retrace threshold: enabling tracing.
+.*
+Hello, world!.*
+Basic counts tool results:
+.*
+Total windows: [0-9]*
+Window #0:
+.*
+Window #1:
+.*
+Window #2:
+.*
diff --git a/clients/drcachesim/tests/windows-simple.templatex b/clients/drcachesim/tests/windows-simple.templatex
new file mode 100644
index 00000000000..a2b52f19bc4
--- /dev/null
+++ b/clients/drcachesim/tests/windows-simple.templatex
@@ -0,0 +1,15 @@
+Hit delay threshold: enabling tracing.
+Hit tracing window #0 limit: disabling tracing.
+Hit retrace threshold: enabling tracing.
+.*
+Hello, world!.*
+---- ----
+Basic counts tool results:
+.*
+Total windows: [0-9]*
+Window #0:
+.*
+Window #1:
+.*
+Window #2:
+.*
diff --git a/clients/drcachesim/tools/basic_counts.cpp b/clients/drcachesim/tools/basic_counts.cpp
index bcc4464c445..dddfda76c6e 100644
--- a/clients/drcachesim/tools/basic_counts.cpp
+++ b/clients/drcachesim/tools/basic_counts.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2017-2019 Google, Inc. All rights reserved.
+ * Copyright (c) 2017-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -68,10 +68,10 @@ basic_counts_t::parallel_shard_supported()
void *
basic_counts_t::parallel_shard_init(int shard_index, void *worker_data)
{
- auto counters = new counters_t;
+ auto per_shard = new per_shard_t;
std::lock_guard<std::mutex> guard(shard_map_mutex_);
- shard_map_[shard_index] = counters;
- return reinterpret_cast(counters);
+ shard_map_[shard_index] = per_shard;
+ return reinterpret_cast<void *>(per_shard);
}
bool
@@ -84,14 +84,15 @@ basic_counts_t::parallel_shard_exit(void *shard_data)
std::string
basic_counts_t::parallel_shard_error(void *shard_data)
{
- counters_t *counters = reinterpret_cast<counters_t *>(shard_data);
- return counters->error;
+ per_shard_t *per_shard = reinterpret_cast<per_shard_t *>(shard_data);
+ return per_shard->error;
}
bool
basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
{
- counters_t *counters = reinterpret_cast<counters_t *>(shard_data);
+ per_shard_t *per_shard = reinterpret_cast<per_shard_t *>(shard_data);
+ counters_t *counters = &per_shard->counters[per_shard->counters.size() - 1];
if (type_is_instr(memref.instr.type)) {
++counters->instrs;
counters->unique_pc_addrs.insert(memref.instr.addr);
@@ -111,6 +112,12 @@ basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_XFER) {
++counters->xfer_markers;
} else {
+ if (memref.marker.marker_type == TRACE_MARKER_TYPE_WINDOW_ID &&
+ memref.marker.marker_value != per_shard->last_window) {
+ per_shard->last_window = memref.marker.marker_value;
+ per_shard->counters.resize(per_shard->last_window + 1 /*0-based*/);
+ counters = &per_shard->counters[per_shard->counters.size() - 1];
+ }
switch (memref.marker.marker_type) {
case TRACE_MARKER_TYPE_FUNC_ID: ++counters->func_id_markers; break;
case TRACE_MARKER_TYPE_FUNC_RETADDR: ++counters->func_retaddr_markers; break;
@@ -120,7 +127,7 @@ basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
}
}
} else if (memref.data.type == TRACE_TYPE_THREAD_EXIT) {
- counters->tid = memref.exit.tid;
+ per_shard->tid = memref.exit.tid;
} else if (memref.data.type == TRACE_TYPE_INSTR_FLUSH) {
counters->icache_flushes++;
} else if (memref.data.type == TRACE_TYPE_DATA_FLUSH) {
@@ -132,90 +139,101 @@ basic_counts_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
bool
basic_counts_t::process_memref(const memref_t &memref)
{
- counters_t *counters;
+ per_shard_t *per_shard;
const auto &lookup = shard_map_.find(memref.data.tid);
if (lookup == shard_map_.end()) {
- counters = new counters_t;
- shard_map_[memref.data.tid] = counters;
+ per_shard = new per_shard_t;
+ shard_map_[memref.data.tid] = per_shard;
} else
- counters = lookup->second;
- if (!parallel_shard_memref(reinterpret_cast<void *>(counters), memref)) {
- error_string_ = counters->error;
+ per_shard = lookup->second;
+ if (!parallel_shard_memref(reinterpret_cast<void *>(per_shard), memref)) {
+ error_string_ = per_shard->error;
return false;
}
return true;
}
bool
-basic_counts_t::cmp_counters(const std::pair<memref_tid_t, counters_t *> &l,
- const std::pair<memref_tid_t, counters_t *> &r)
+basic_counts_t::cmp_threads(const std::pair<memref_tid_t, per_shard_t *> &l,
+ const std::pair<memref_tid_t, per_shard_t *> &r)
{
- return (l.second->instrs > r.second->instrs);
+ return (l.second->counters[0].instrs > r.second->counters[0].instrs);
+}
+
+void
+basic_counts_t::print_counters(const counters_t &counters, int_least64_t num_threads,
+ const std::string &prefix)
+{
+ std::cerr << std::setw(12) << counters.instrs << prefix
+ << " (fetched) instructions\n";
+ std::cerr << std::setw(12) << counters.unique_pc_addrs.size() << prefix
+ << " unique (fetched) instructions\n";
+ std::cerr << std::setw(12) << counters.instrs_nofetch << prefix
+ << " non-fetched instructions\n";
+ std::cerr << std::setw(12) << counters.prefetches << prefix << " prefetches\n";
+ std::cerr << std::setw(12) << counters.loads << prefix << " data loads\n";
+ std::cerr << std::setw(12) << counters.stores << prefix << " data stores\n";
+ std::cerr << std::setw(12) << counters.icache_flushes << prefix
+ << " icache flushes\n";
+ std::cerr << std::setw(12) << counters.dcache_flushes << prefix
+ << " dcache flushes\n";
+ if (num_threads > 0) {
+ std::cerr << std::setw(12) << num_threads << prefix << " threads\n";
+ }
+ std::cerr << std::setw(12) << counters.sched_markers << prefix
+ << " scheduling markers\n";
+ std::cerr << std::setw(12) << counters.xfer_markers << prefix
+ << " transfer markers\n";
+ std::cerr << std::setw(12) << counters.func_id_markers << prefix
+ << " function id markers\n";
+ std::cerr << std::setw(12) << counters.func_retaddr_markers << prefix
+ << " function return address markers\n";
+ std::cerr << std::setw(12) << counters.func_arg_markers << prefix
+ << " function argument markers\n";
+ std::cerr << std::setw(12) << counters.func_retval_markers << prefix
+ << " function return value markers\n";
+ std::cerr << std::setw(12) << counters.other_markers << prefix << " other markers\n";
}
bool
basic_counts_t::print_results()
{
counters_t total;
+ uintptr_t num_windows = 1;
for (const auto &shard : shard_map_) {
- total += *shard.second;
+ num_windows = std::max(num_windows, shard.second->counters.size());
+ }
+ for (const auto &shard : shard_map_) {
+ for (const auto &ctr : shard.second->counters) {
+ total += ctr;
+ }
}
std::cerr << TOOL_NAME << " results:\n";
std::cerr << "Total counts:\n";
- std::cerr << std::setw(12) << total.instrs << " total (fetched) instructions\n";
- std::cerr << std::setw(12) << total.unique_pc_addrs.size()
- << " total unique (fetched) instructions\n";
- std::cerr << std::setw(12) << total.instrs_nofetch
- << " total non-fetched instructions\n";
- std::cerr << std::setw(12) << total.prefetches << " total prefetches\n";
- std::cerr << std::setw(12) << total.loads << " total data loads\n";
- std::cerr << std::setw(12) << total.stores << " total data stores\n";
- std::cerr << std::setw(12) << total.icache_flushes << " total icache flushes\n";
- std::cerr << std::setw(12) << total.dcache_flushes << " total dcache flushes\n";
- std::cerr << std::setw(12) << shard_map_.size() << " total threads\n";
- std::cerr << std::setw(12) << total.sched_markers << " total scheduling markers\n";
- std::cerr << std::setw(12) << total.xfer_markers << " total transfer markers\n";
- std::cerr << std::setw(12) << total.func_id_markers << " total function id markers\n";
- std::cerr << std::setw(12) << total.func_retaddr_markers
- << " total function return address markers\n";
- std::cerr << std::setw(12) << total.func_arg_markers
- << " total function argument markers\n";
- std::cerr << std::setw(12) << total.func_retval_markers
- << " total function return value markers\n";
- std::cerr << std::setw(12) << total.other_markers << " total other markers\n";
+ print_counters(total, shard_map_.size(), " total");
+
+ if (num_windows > 1) {
+ std::cerr << "Total windows: " << num_windows << "\n";
+ for (uintptr_t i = 0; i < num_windows; ++i) {
+ std::cerr << "Window #" << i << ":\n";
+ for (const auto &shard : shard_map_) {
+ if (shard.second->counters.size() > i) {
+ print_counters(shard.second->counters[i], 0, " window");
+ }
+ }
+ }
+ }
// Print the threads sorted by instrs.
- std::vector<std::pair<memref_tid_t, counters_t *>> sorted(shard_map_.begin(),
- shard_map_.end());
- std::sort(sorted.begin(), sorted.end(), cmp_counters);
+ std::vector<std::pair<memref_tid_t, per_shard_t *>> sorted(shard_map_.begin(),
+ shard_map_.end());
+ std::sort(sorted.begin(), sorted.end(), cmp_threads);
for (const auto &keyvals : sorted) {
std::cerr << "Thread " << keyvals.second->tid << " counts:\n";
- std::cerr << std::setw(12) << keyvals.second->instrs
- << " (fetched) instructions\n";
- std::cerr << std::setw(12) << keyvals.second->unique_pc_addrs.size()
- << " unique (fetched) instructions\n";
- std::cerr << std::setw(12) << keyvals.second->instrs_nofetch
- << " non-fetched instructions\n";
- std::cerr << std::setw(12) << keyvals.second->prefetches << " prefetches\n";
- std::cerr << std::setw(12) << keyvals.second->loads << " data loads\n";
- std::cerr << std::setw(12) << keyvals.second->stores << " data stores\n";
- std::cerr << std::setw(12) << keyvals.second->icache_flushes
- << " icache flushes\n";
- std::cerr << std::setw(12) << keyvals.second->dcache_flushes
- << " dcache flushes\n";
- std::cerr << std::setw(12) << keyvals.second->sched_markers
- << " scheduling markers\n";
- std::cerr << std::setw(12) << keyvals.second->xfer_markers
- << " transfer markers\n";
- std::cerr << std::setw(12) << keyvals.second->func_id_markers
- << " function id markers\n";
- std::cerr << std::setw(12) << keyvals.second->func_retaddr_markers
- << " function return address markers\n";
- std::cerr << std::setw(12) << keyvals.second->func_arg_markers
- << " function argument markers\n";
- std::cerr << std::setw(12) << keyvals.second->func_retval_markers
- << " function return value markers\n";
- std::cerr << std::setw(12) << keyvals.second->other_markers << " other markers\n";
+ print_counters(keyvals.second->counters[0], 0, "");
}
+
+ // TODO i#3599: also print thread-per-window stats.
+
return true;
}
diff --git a/clients/drcachesim/tools/basic_counts.h b/clients/drcachesim/tools/basic_counts.h
index 7d950fb9eee..3ef5f43f90e 100644
--- a/clients/drcachesim/tools/basic_counts.h
+++ b/clients/drcachesim/tools/basic_counts.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2017-2020 Google, Inc. All rights reserved.
+ * Copyright (c) 2017-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -65,7 +65,7 @@ class basic_counts_t : public analysis_tool_t {
{
}
counters_t &
- operator+=(counters_t &rhs)
+ operator+=(const counters_t &rhs)
{
instrs += rhs.instrs;
instrs_nofetch += rhs.instrs_nofetch;
@@ -86,7 +86,6 @@ class basic_counts_t : public analysis_tool_t {
}
return *this;
}
- memref_tid_t tid = 0;
int_least64_t instrs = 0;
int_least64_t instrs_nofetch = 0;
int_least64_t prefetches = 0;
@@ -102,14 +101,28 @@ class basic_counts_t : public analysis_tool_t {
int_least64_t icache_flushes = 0;
int_least64_t dcache_flushes = 0;
std::unordered_set<addr_t> unique_pc_addrs;
+ };
+ struct per_shard_t {
+ per_shard_t()
+ {
+ counters.resize(1);
+ }
+ memref_tid_t tid = 0;
+ // One counters_t per tracing window; a single slot when windows are unused.
+ std::vector<counters_t> counters;
std::string error;
+ uintptr_t last_window = 0;
};
+
static bool
- cmp_counters(const std::pair<memref_tid_t, counters_t *> &l,
- const std::pair<memref_tid_t, counters_t *> &r);
+ cmp_threads(const std::pair<memref_tid_t, per_shard_t *> &l,
+ const std::pair<memref_tid_t, per_shard_t *> &r);
+ static void
+ print_counters(const counters_t &counters, int_least64_t num_threads,
+ const std::string &prefix);
// The keys here are int for parallel, tid for serial.
- std::unordered_map<memref_tid_t, counters_t *> shard_map_;
+ std::unordered_map<memref_tid_t, per_shard_t *> shard_map_;
// This mutex is only needed in parallel_shard_init. In all other accesses to
// shard_map (process_memref, print_results) we are single-threaded.
std::mutex shard_map_mutex_;
diff --git a/clients/drcachesim/tools/invariant_checker.cpp b/clients/drcachesim/tools/invariant_checker.cpp
index ac0ebd81567..b279c365c47 100644
--- a/clients/drcachesim/tools/invariant_checker.cpp
+++ b/clients/drcachesim/tools/invariant_checker.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2017-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2017-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -257,6 +257,8 @@ invariant_checker_t::parallel_shard_memref(void *shard_data, const memref_t &mem
TRACE_MARKER_TYPE_KERNEL_EVENT ||
shard->prev_xfer_marker_.marker.marker_type ==
TRACE_MARKER_TYPE_KERNEL_XFER)) ||
+ // We expect a gap on a window transition.
+ shard->window_transition_ ||
shard->prev_instr_.instr.type == TRACE_TYPE_INSTR_SYSENTER,
"Non-explicit control flow has no marker");
// XXX: If we had instr decoding we could check direct branch targets
@@ -302,6 +304,8 @@ invariant_checker_t::parallel_shard_memref(void *shard_data, const memref_t &mem
// prior (i#3937).
shard->prev_xfer_marker_.marker.marker_type = TRACE_MARKER_TYPE_VERSION;
shard->saw_timestamp_but_no_instr_ = false;
+ // Clear window transitions on instrs.
+ shard->window_transition_ = false;
} else if (knob_verbose_ >= 3) {
std::cerr << "::" << memref.data.pid << ":" << memref.data.tid << ":: "
<< " type " << memref.instr.type << "\n";
@@ -315,7 +319,7 @@ invariant_checker_t::parallel_shard_memref(void *shard_data, const memref_t &mem
}
}
if (memref.marker.type == TRACE_TYPE_MARKER &&
- // Ignore timestamp, etc. markers which show up a signal delivery boundaries
+ // Ignore timestamp, etc. markers which show up at signal delivery boundaries
// b/c the tracer does a buffer flush there.
(memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_EVENT ||
memref.marker.marker_type == TRACE_MARKER_TYPE_KERNEL_XFER)) {
@@ -337,6 +341,12 @@ invariant_checker_t::parallel_shard_memref(void *shard_data, const memref_t &mem
shard->prev_xfer_marker_ = memref;
shard->last_xfer_marker_ = memref;
}
+ if (memref.marker.type == TRACE_TYPE_MARKER &&
+ memref.marker.marker_type == TRACE_MARKER_TYPE_WINDOW_ID) {
+ if (shard->last_window_ != memref.marker.marker_value)
+ shard->window_transition_ = true;
+ shard->last_window_ = memref.marker.marker_value;
+ }
#ifdef UNIX
shard->prev_prev_entry_ = shard->prev_entry_;
diff --git a/clients/drcachesim/tools/invariant_checker.h b/clients/drcachesim/tools/invariant_checker.h
index 2bffc85bff4..27d967d4517 100644
--- a/clients/drcachesim/tools/invariant_checker.h
+++ b/clients/drcachesim/tools/invariant_checker.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2016-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2016-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -99,6 +99,8 @@ class invariant_checker_t : public analysis_tool_t {
// operation.
addr_t app_handler_pc_ = 0;
offline_file_type_t file_type_ = OFFLINE_FILE_TYPE_DEFAULT;
+ uintptr_t last_window_ = 0;
+ bool window_transition_ = false;
};
// We provide this for subclasses to run these invariants with custom
diff --git a/clients/drcachesim/tools/view.cpp b/clients/drcachesim/tools/view.cpp
index d61a461f386..833eef8df23 100644
--- a/clients/drcachesim/tools/view.cpp
+++ b/clients/drcachesim/tools/view.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2017-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2017-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -71,6 +71,7 @@ view_t::view_t(const std::string &module_file_path, memref_tid_t thread,
, prev_tid_(-1)
, filetype_(-1)
, num_refs_(0)
+ , timestamp_(0)
{
}
@@ -190,6 +191,13 @@ view_t::process_memref(const memref_t &memref)
return false;
}
return true; // Do not count toward -sim_refs yet b/c we don't have tid.
+ case TRACE_MARKER_TYPE_TIMESTAMP:
+ // Delay to see whether this is a new window. We assume a timestamp
+ // is always followed by another marker (cpu or window).
+ // We can't easily reorder and place window markers before timestamps
+ // since memref iterators use the timestamps to order buffer units.
+ timestamp_ = memref.marker.marker_value;
+ return true;
default: break;
}
}
@@ -231,7 +239,7 @@ view_t::process_memref(const memref_t &memref)
// Handled above.
break;
case TRACE_MARKER_TYPE_TIMESTAMP:
- std::cerr << "<marker: timestamp " << memref.marker.marker_value << ">\n";
+ // Handled above.
break;
case TRACE_MARKER_TYPE_CPU_ID:
// We include the thread ID here under the assumption that we will always
@@ -273,11 +281,34 @@ view_t::process_memref(const memref_t &memref)
std::cerr << "\n";
break;
+ case TRACE_MARKER_TYPE_WINDOW_ID:
+ if (last_window_[memref.marker.tid] != memref.marker.marker_value) {
+ std::cerr
+ << "------------------------------------------------------------\n";
+ print_prefix(memref);
+ }
+ if (timestamp_ > 0) {
+ if (!should_skip()) {
+ std::cerr << "<marker: timestamp " << timestamp_ << ">\n";
+ timestamp_ = 0;
+ print_prefix(memref);
+ }
+ }
+ std::cerr << "<marker: window " << memref.marker.marker_value << ">\n";
+ last_window_[memref.marker.tid] = memref.marker.marker_value;
+ break;
default:
std::cerr << "<marker: type " << memref.marker.marker_type << "; value " << memref.marker.marker_value << ">\n";
break;
}
+ if (timestamp_ > 0) {
+ if (!should_skip()) {
+ print_prefix(memref);
+ std::cerr << "<marker: timestamp " << timestamp_ << ">\n";
+ timestamp_ = 0;
+ }
+ }
return true;
}
diff --git a/clients/drcachesim/tools/view.h b/clients/drcachesim/tools/view.h
index 6b4edd8134b..bd49ebd62dd 100644
--- a/clients/drcachesim/tools/view.h
+++ b/clients/drcachesim/tools/view.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2018-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2018-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -112,6 +112,8 @@ class view_t : public analysis_tool_t {
intptr_t filetype_;
std::unordered_set<memref_tid_t> printed_header_;
uint64_t num_refs_;
+ std::unordered_map<memref_tid_t, uintptr_t> last_window_;
+ uintptr_t timestamp_;
};
#endif /* _VIEW_H_ */
diff --git a/clients/drcachesim/tracer/instru.h b/clients/drcachesim/tracer/instru.h
index 9879983189f..9ee287fafad 100644
--- a/clients/drcachesim/tracer/instru.h
+++ b/clients/drcachesim/tracer/instru.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2016-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2016-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -196,6 +196,8 @@ class instru_t {
get_entry_type(byte *buf_ptr) const = 0;
virtual size_t
get_entry_size(byte *buf_ptr) const = 0;
+ virtual int
+ get_instr_count(byte *buf_ptr) const = 0;
virtual addr_t
get_entry_addr(byte *buf_ptr) const = 0;
virtual void
@@ -217,7 +219,7 @@ class instru_t {
append_thread_header(byte *buf_ptr, thread_id_t tid) = 0;
// This is a per-buffer-writeout header.
virtual int
- append_unit_header(byte *buf_ptr, thread_id_t tid) = 0;
+ append_unit_header(byte *buf_ptr, thread_id_t tid, ptr_int_t window) = 0;
virtual void
set_frozen_timestamp(uint64 timestamp)
{
@@ -307,6 +309,8 @@ class online_instru_t : public instru_t {
get_entry_type(byte *buf_ptr) const override;
size_t
get_entry_size(byte *buf_ptr) const override;
+ int
+ get_instr_count(byte *buf_ptr) const override;
addr_t
get_entry_addr(byte *buf_ptr) const override;
void
@@ -327,7 +331,7 @@ class online_instru_t : public instru_t {
virtual int
append_thread_header(byte *buf_ptr, thread_id_t tid, offline_file_type_t file_type);
int
- append_unit_header(byte *buf_ptr, thread_id_t tid) override;
+ append_unit_header(byte *buf_ptr, thread_id_t tid, ptr_int_t window) override;
int
instrument_memref(void *drcontext, instrlist_t *ilist, instr_t *where,
@@ -375,6 +379,8 @@ class offline_instru_t : public instru_t {
get_entry_type(byte *buf_ptr) const override;
size_t
get_entry_size(byte *buf_ptr) const override;
+ int
+ get_instr_count(byte *buf_ptr) const override;
addr_t
get_entry_addr(byte *buf_ptr) const override;
void
@@ -398,7 +404,7 @@ class offline_instru_t : public instru_t {
virtual int
append_thread_header(byte *buf_ptr, thread_id_t tid, offline_file_type_t file_type);
int
- append_unit_header(byte *buf_ptr, thread_id_t tid) override;
+ append_unit_header(byte *buf_ptr, thread_id_t tid, ptr_int_t window) override;
int
instrument_memref(void *drcontext, instrlist_t *ilist, instr_t *where,
diff --git a/clients/drcachesim/tracer/instru_offline.cpp b/clients/drcachesim/tracer/instru_offline.cpp
index 2c7438ee489..a8411b8c71b 100644
--- a/clients/drcachesim/tracer/instru_offline.cpp
+++ b/clients/drcachesim/tracer/instru_offline.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2016-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2016-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -235,6 +235,18 @@ offline_instru_t::get_entry_size(byte *buf_ptr) const
return 0;
}
+int
+offline_instru_t::get_instr_count(byte *buf_ptr) const
+{
+ offline_entry_t *entry = (offline_entry_t *)buf_ptr;
+ if (entry->addr.type != OFFLINE_TYPE_PC)
+ return 0;
+ // TODO i#3995: We should *not* count "non-fetched" instrs so we'll match
+ // hardware performance counters.
+ // Xref i#4948 and i#4915 on getting rid of "non-fetched" instrs.
+ return entry->pc.instr_count;
+}
+
addr_t
offline_instru_t::get_entry_addr(byte *buf_ptr) const
{
@@ -344,7 +356,7 @@ offline_instru_t::append_thread_header(byte *buf_ptr, thread_id_t tid)
}
int
-offline_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid)
+offline_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid, ptr_int_t window)
{
byte *new_buf = buf_ptr;
offline_entry_t *entry = (offline_entry_t *)new_buf;
@@ -352,6 +364,8 @@ offline_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid)
entry->timestamp.usec =
frozen_timestamp_ != 0 ? frozen_timestamp_ : instru_t::get_timestamp();
new_buf += sizeof(*entry);
+ if (window >= 0)
+ new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_WINDOW_ID, (uintptr_t)window);
new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_CPU_ID, instru_t::get_cpu_id());
return (int)(new_buf - buf_ptr);
}
diff --git a/clients/drcachesim/tracer/instru_online.cpp b/clients/drcachesim/tracer/instru_online.cpp
index 8428c855932..8c52e62d8ab 100644
--- a/clients/drcachesim/tracer/instru_online.cpp
+++ b/clients/drcachesim/tracer/instru_online.cpp
@@ -72,6 +72,20 @@ online_instru_t::get_entry_size(byte *buf_ptr) const
return entry->size;
}
+int
+online_instru_t::get_instr_count(byte *buf_ptr) const
+{
+ trace_entry_t *entry = (trace_entry_t *)buf_ptr;
+ if (!type_is_instr((trace_type_t)entry->type))
+ return 0;
+ // TODO i#3995: We should *not* count "non-fetched" instrs so we'll match
+ // hardware performance counters.
+ // Xref i#4948 and i#4915 on getting rid of "non-fetched" instrs.
+ if (entry->type == TRACE_TYPE_INSTR_BUNDLE)
+ return entry->size;
+ return 1;
+}
+
addr_t
online_instru_t::get_entry_addr(byte *buf_ptr) const
{
@@ -165,7 +179,7 @@ online_instru_t::append_thread_header(byte *buf_ptr, thread_id_t tid)
}
int
-online_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid)
+online_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid, ptr_int_t window)
{
byte *new_buf = buf_ptr;
new_buf += append_tid(new_buf, tid);
@@ -174,6 +188,8 @@ online_instru_t::append_unit_header(byte *buf_ptr, thread_id_t tid)
static_cast<uintptr_t>(frozen_timestamp_ != 0
? frozen_timestamp_
: instru_t::get_timestamp()));
+ if (window >= 0)
+ new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_WINDOW_ID, (uintptr_t)window);
new_buf += append_marker(new_buf, TRACE_MARKER_TYPE_CPU_ID, instru_t::get_cpu_id());
return (int)(new_buf - buf_ptr);
}
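For reference, both append_unit_header() implementations now emit the same record sequence; a comment-only paraphrase of the per-buffer header layout (summarizing the code above, not a separate format spec):

    // Per-buffer (unit) header, in emission order:
    //   timestamp                               // always first
    //   TRACE_MARKER_TYPE_WINDOW_ID = window    // only when window >= 0
    //   TRACE_MARKER_TYPE_CPU_ID = current cpu  // always last
    // Passing window == -1 (windows disabled) leaves the header format
    // identical to the prior version.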
diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp
index fad224e8749..793c4f774ad 100644
--- a/clients/drcachesim/tracer/raw2trace.cpp
+++ b/clients/drcachesim/tracer/raw2trace.cpp
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2016-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2016-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -594,18 +594,24 @@ raw2trace_t::process_next_thread_buffer(raw2trace_thread_data_t *tdata,
}
continue;
}
- // Append delayed branches at the end or before xfer markers; else, delay
- // until we see a non-cti inside a block, to handle double branches (i#5141)
- // and to group all (non-xfer) markers with a new timestamp.
+ // Append delayed branches at the end or before xfer or window-change
+ // markers; else, delay until we see a non-cti inside a block, to handle
+ // double branches (i#5141) and to group all (non-xfer) markers with a new
+ // timestamp.
if (entry.extended.type == OFFLINE_TYPE_EXTENDED &&
(entry.extended.ext == OFFLINE_EXT_TYPE_FOOTER ||
(entry.extended.ext == OFFLINE_EXT_TYPE_MARKER &&
- (entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_EVENT ||
- entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_XFER)))) {
+ ((entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_EVENT ||
+ entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_XFER) ||
+ (entry.extended.valueB == TRACE_MARKER_TYPE_WINDOW_ID &&
+ entry.extended.valueA != tdata->last_window))))) {
tdata->error = append_delayed_branch(tdata);
if (!tdata->error.empty())
return tdata->error;
}
+ if (entry.extended.ext == OFFLINE_EXT_TYPE_MARKER &&
+ entry.extended.valueB == TRACE_MARKER_TYPE_WINDOW_ID)
+ tdata->last_window = entry.extended.valueA;
tdata->error = process_offline_entry(tdata, &entry, tdata->tid, end_of_record,
&last_bb_handled);
if (!tdata->error.empty())
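The nested condition above is dense; restated as a standalone predicate (a paraphrase reusing the offline_entry_t fields from the hunk, not the shipped code):

    // Delayed branches must be appended before processing an entry at the
    // footer, before kernel event/xfer markers, and before a marker that
    // starts a new tracing window, so branches never leak across windows.
    static bool
    must_flush_delayed_branches(const offline_entry_t &entry, uint64 last_window)
    {
        if (entry.extended.type != OFFLINE_TYPE_EXTENDED)
            return false;
        if (entry.extended.ext == OFFLINE_EXT_TYPE_FOOTER)
            return true;
        if (entry.extended.ext != OFFLINE_EXT_TYPE_MARKER)
            return false;
        return entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_EVENT ||
            entry.extended.valueB == TRACE_MARKER_TYPE_KERNEL_XFER ||
            (entry.extended.valueB == TRACE_MARKER_TYPE_WINDOW_ID &&
             entry.extended.valueA != last_window);
    }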
diff --git a/clients/drcachesim/tracer/raw2trace.h b/clients/drcachesim/tracer/raw2trace.h
index 99129c9b94d..de2d2bb66ed 100644
--- a/clients/drcachesim/tracer/raw2trace.h
+++ b/clients/drcachesim/tracer/raw2trace.h
@@ -1,5 +1,5 @@
/* **********************************************************
- * Copyright (c) 2016-2021 Google, Inc. All rights reserved.
+ * Copyright (c) 2016-2022 Google, Inc. All rights reserved.
* **********************************************************/
/*
@@ -1665,6 +1665,7 @@ class raw2trace_t : public trace_converter_t {
bool prev_instr_was_rep_string;
app_pc last_decode_block_start;
block_summary_t *last_block_summary;
+ uint64 last_window = 0;
// Statistics on the processing.
uint64 count_elided = 0;
diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp
index a91e1128db0..6128455b9ba 100644
--- a/clients/drcachesim/tracer/tracer.cpp
+++ b/clients/drcachesim/tracer/tracer.cpp
@@ -123,6 +123,7 @@ typedef struct {
byte *buf_base;
uint64 num_refs;
uint64 bytes_written;
+ uint64 instr_count;
/* For offline traces */
file_t file;
size_t init_header_size;
@@ -189,6 +190,9 @@ enum {
* on the transition.
*/
MEMTRACE_TLS_OFFS_ICOUNTDOWN,
+ // For has_tracing_windows(), this is the ordinal of the tracing window at
+ // the start of the current trace buffer. It is -1 if windows are not present.
+ MEMTRACE_TLS_OFFS_WINDOW,
MEMTRACE_TLS_COUNT, /* total number of TLS slots allocated */
};
static reg_id_t tls_seg;
@@ -204,6 +208,50 @@ static bool (*should_trace_thread_cb)(thread_id_t tid, void *user_data);
static void *trace_thread_cb_user_data;
static bool thread_filtering_enabled;
+/* Similarly to -trace_after_instrs, we use thread-local counters to avoid
+ * synchronization costs and only add to the global every N counts.
+ */
+#define INSTR_COUNT_LOCAL_UNIT 10000
+static std::atomic<uint64> traced_instr_count;
+
+static bool
+count_traced_instrs(void *drcontext, int toadd);
+
+static void
+reached_traced_instrs_threshold(void *drcontext);
+
+static uint64
+local_instr_count_threshold()
+{
+ uint64 limit = op_trace_for_instrs.get_value();
+ if (limit > INSTR_COUNT_LOCAL_UNIT * 10)
+ return INSTR_COUNT_LOCAL_UNIT;
+ else {
+ /* For small windows, use a smaller add-to-global trigger. */
+ return limit / 10;
+ }
+}
+
+static bool
+has_tracing_windows()
+{
+ return op_trace_for_instrs.get_value() > 0 || op_retrace_every_instrs.get_value() > 0;
+}
+
+static void
+set_local_window(per_thread_t *data, ptr_int_t value)
+{
+ *(ptr_int_t *)TLS_SLOT(data->seg_base, MEMTRACE_TLS_OFFS_WINDOW) = value;
+}
+
+static ptr_int_t
+get_local_window(per_thread_t *data)
+{
+ return *(ptr_int_t *)TLS_SLOT(data->seg_base, MEMTRACE_TLS_OFFS_WINDOW);
+}
+
+static std::atomic<ptr_int_t> tracing_window;
+
/***************************************************************************
* Buffer writing to disk.
*/
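The batched counting the comment above describes can be shown in isolation (a standalone sketch with illustrative names; the real code keeps the local counter in per_thread_t and reads the threshold from the options):

    #include <atomic>
    #include <cstdint>

    static std::atomic<uint64_t> global_count;
    constexpr uint64_t kLocalUnit = 10000; // INSTR_COUNT_LOCAL_UNIT

    struct local_counter_t {
        uint64_t count = 0;
    };

    // Accumulate locally; touch the shared atomic only once per unit.
    // Returns true once the global threshold has been crossed.
    static bool
    add_instrs(local_counter_t &local, uint64_t toadd, uint64_t threshold)
    {
        local.count += toadd;
        if (local.count < kLocalUnit)
            return false; // Thread-local: no synchronization cost.
        uint64_t total =
            global_count.fetch_add(local.count, std::memory_order_release) +
            local.count; // fetch_add returns the old value.
        local.count = 0;
        return total >= threshold;
    }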
@@ -363,9 +411,9 @@ create_buffer(per_thread_t *data)
}
static int
-append_unit_header(void *drcontext, byte *buf_ptr, thread_id_t tid)
+append_unit_header(void *drcontext, byte *buf_ptr, thread_id_t tid, ptr_int_t window)
{
- int size_added = instru->append_unit_header(buf_ptr, tid);
+ int size_added = instru->append_unit_header(buf_ptr, tid, window);
if (op_L0_filter.get_value()) {
// Include the instruction count.
// It might be useful to include the count with each miss as well, but
@@ -389,7 +437,7 @@ append_unit_header(void *drcontext, byte *buf_ptr, thread_id_t tid)
}
static inline byte *
-atomic_pipe_write(void *drcontext, byte *pipe_start, byte *pipe_end)
+atomic_pipe_write(void *drcontext, byte *pipe_start, byte *pipe_end, ptr_int_t window)
{
ssize_t towrite = pipe_end - pipe_start;
DR_ASSERT(towrite <= ipc_pipe.get_atomic_write_size() && towrite > 0);
@@ -399,13 +447,14 @@ atomic_pipe_write(void *drcontext, byte *pipe_start, byte *pipe_end)
// Re-emit buffer unit header to handle split pipe writes.
if (pipe_end - buf_hdr_slots_size > pipe_start) {
pipe_start = pipe_end - buf_hdr_slots_size;
- append_unit_header(drcontext, pipe_start, dr_get_thread_id(drcontext));
+ append_unit_header(drcontext, pipe_start, dr_get_thread_id(drcontext), window);
}
return pipe_start;
}
static inline byte *
-write_trace_data(void *drcontext, byte *towrite_start, byte *towrite_end)
+write_trace_data(void *drcontext, byte *towrite_start, byte *towrite_end,
+ ptr_int_t window)
{
if (op_offline.get_value()) {
per_thread_t *data = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx);
@@ -420,7 +469,7 @@ write_trace_data(void *drcontext, byte *towrite_start, byte *towrite_end)
}
return towrite_start;
} else
- return atomic_pipe_write(drcontext, towrite_start, towrite_end);
+ return atomic_pipe_write(drcontext, towrite_start, towrite_end, window);
}
static bool
@@ -460,11 +509,24 @@ memtrace(void *drcontext, bool skip_size_cap)
if (data->num_refs == 0 && op_offline.get_value())
header_size = data->init_header_size;
// We may get called with nothing to write: e.g., on a syscall for -L0_filter.
- if (buf_ptr == data->buf_base + header_size + buf_hdr_slots_size)
+ if (buf_ptr == data->buf_base + header_size + buf_hdr_slots_size) {
+ if (has_tracing_windows())
+ set_local_window(data, tracing_window.load(std::memory_order_acquire));
return;
+ }
// The initial slots are left empty for the header, which we add here.
- header_size += append_unit_header(drcontext, data->buf_base + header_size,
- dr_get_thread_id(drcontext));
+ header_size +=
+ append_unit_header(drcontext, data->buf_base + header_size,
+ dr_get_thread_id(drcontext), get_local_window(data));
+ bool window_changed = false;
+ if (has_tracing_windows() &&
+ get_local_window(data) != tracing_window.load(std::memory_order_acquire)) {
+ // This buffer is for a prior window. Do not add to the current window count.
+ data->instr_count = 0;
+ window_changed = true;
+ // No need to append TRACE_MARKER_TYPE_WINDOW_ID: the next buffer will have
+ // one in its header.
+ }
pipe_start = data->buf_base;
pipe_end = pipe_start;
if (!skip_size_cap &&
@@ -477,6 +539,7 @@ memtrace(void *drcontext, bool skip_size_cap)
if (is_num_refs_beyond_global_max()) {
/* std::atomic *should* be safe (we can assert std::atomic_is_lock_free())
* but to avoid any risk we use DR's atomics.
+ * Update: we are now using std::atomic for some new variables.
*/
if (dr_atomic_load32(¬ify_beyond_global_max_once) == 0) {
int count = dr_atomic_add32_return_sum(¬ify_beyond_global_max_once, 1);
@@ -496,11 +559,25 @@ memtrace(void *drcontext, bool skip_size_cap)
data->bytes_written += buf_ptr - pipe_start;
if (do_write) {
- if (have_phys && op_use_physical.get_value()) {
+ bool hit_window_end = false;
+ if ((have_phys && op_use_physical.get_value()) ||
+ op_trace_for_instrs.get_value() > 0) {
for (mem_ref = data->buf_base + header_size; mem_ref < buf_ptr;
mem_ref += instru->sizeof_entry()) {
+ if (!window_changed && !hit_window_end &&
+ op_trace_for_instrs.get_value() > 0) {
+ hit_window_end =
+ count_traced_instrs(drcontext, instru->get_instr_count(mem_ref));
+ // We have to finish this buffer so we'll go a little beyond the
+ // precise requested window length.
+ // XXX: For small windows this may be significant: we could go
+ // ~5K beyond if we hit the threshold near the start of a full buffer.
+ // Should we discard the rest of the entries in such a case, at
+ // a block boundary, even though we already collected them?
+ }
trace_type_t type = instru->get_entry_type(mem_ref);
- if (type != TRACE_TYPE_THREAD && type != TRACE_TYPE_THREAD_EXIT &&
+ if (have_phys && op_use_physical.get_value() &&
+ type != TRACE_TYPE_THREAD && type != TRACE_TYPE_THREAD_EXIT &&
type != TRACE_TYPE_PID) {
addr_t virt = instru->get_entry_addr(mem_ref);
addr_t phys = physaddr.virtual2physical(virt);
@@ -519,6 +596,8 @@ memtrace(void *drcontext, bool skip_size_cap)
}
}
}
+ if (hit_window_end)
+ reached_traced_instrs_threshold(drcontext);
}
if (!op_offline.get_value()) {
for (mem_ref = data->buf_base + header_size; mem_ref < buf_ptr;
@@ -538,7 +617,8 @@ memtrace(void *drcontext, bool skip_size_cap)
pipe_start) > ipc_pipe.get_atomic_write_size()) {
DR_ASSERT(is_ok_to_split_before(
instru->get_entry_type(pipe_start + header_size)));
- pipe_start = atomic_pipe_write(drcontext, pipe_start, pipe_end);
+ pipe_start = atomic_pipe_write(drcontext, pipe_start, pipe_end,
+ get_local_window(data));
}
}
}
@@ -551,15 +631,16 @@ memtrace(void *drcontext, bool skip_size_cap)
if ((buf_ptr - pipe_start) > ipc_pipe.get_atomic_write_size()) {
DR_ASSERT(is_ok_to_split_before(
instru->get_entry_type(pipe_start + header_size)));
- pipe_start = atomic_pipe_write(drcontext, pipe_start, pipe_end);
+ pipe_start = atomic_pipe_write(drcontext, pipe_start, pipe_end,
+ get_local_window(data));
}
if ((buf_ptr - pipe_start) > (ssize_t)buf_hdr_slots_size) {
DR_ASSERT(is_ok_to_split_before(
instru->get_entry_type(pipe_start + header_size)));
- atomic_pipe_write(drcontext, pipe_start, buf_ptr);
+ atomic_pipe_write(drcontext, pipe_start, buf_ptr, get_local_window(data));
}
} else {
- write_trace_data(drcontext, pipe_start, buf_ptr);
+ write_trace_data(drcontext, pipe_start, buf_ptr, get_local_window(data));
}
auto span = buf_ptr - (data->buf_base + header_size);
DR_ASSERT(span % instru->sizeof_entry() == 0);
@@ -598,6 +679,8 @@ memtrace(void *drcontext, bool skip_size_cap)
}
dr_mutex_unlock(mutex);
}
+ if (has_tracing_windows())
+ set_local_window(data, tracing_window.load(std::memory_order_acquire));
}
/* clean_call sends the memory reference info to the simulator */
@@ -669,12 +752,61 @@ event_pre_syscall(void *drcontext, int sysnum);
static void
event_kernel_xfer(void *drcontext, const dr_kernel_xfer_info_t *info);
+// Returns whether we've reached the end of this tracing window.
+static bool
+count_traced_instrs(void *drcontext, int toadd)
+{
+ per_thread_t *data = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx);
+ data->instr_count += toadd;
+ if (data->instr_count >= local_instr_count_threshold()) {
+ uint64 newval =
+ traced_instr_count.fetch_add(data->instr_count, std::memory_order_release) +
+ // fetch_add returns old value.
+ data->instr_count;
+ data->instr_count = 0;
+ if (newval >= op_trace_for_instrs.get_value())
+ return true;
+ }
+ return false;
+}
+
+static void
+reached_traced_instrs_threshold(void *drcontext)
+{
+ per_thread_t *data = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx);
+ dr_mutex_lock(mutex);
+ if (get_local_window(data) != tracing_window.load(std::memory_order_acquire)) {
+ // Another thread already changed the mode.
+ dr_mutex_unlock(mutex);
+ return;
+ }
+ // We've reached the end of our window.
+ // We do not attempt a proactive synchronous flush of other threads'
+ // buffers, relying on our end-of-block check for a mode change.
+ // (If -trace_every_instrs is not set and we're not going to trace
+ // again, we still use a counting mode for simplicity of not adding
+ // yet another mode.)
+ NOTIFY(0, "Hit tracing window #%zd limit: disabling tracing.\n",
+ tracing_window.load(std::memory_order_acquire));
+ // No need to append TRACE_MARKER_TYPE_WINDOW_ID: the next buffer will have
+ // one in its header.
+ // If we're counting at exit time, this increment means that the thread
+ // exit entries will be the only ones in this new window: but that seems
+ // reasonable.
+ tracing_window.fetch_add(1, std::memory_order_release);
+ DR_ASSERT(tracing_disabled.load(std::memory_order_acquire) == BBDUP_MODE_TRACE);
+ tracing_disabled.store(BBDUP_MODE_COUNT, std::memory_order_release);
+ traced_instr_count.store(0, std::memory_order_release);
+ dr_mutex_unlock(mutex);
+}
+
static uintptr_t
event_bb_setup(void *drbbdup_ctx, void *drcontext, void *tag, instrlist_t *bb,
bool *enable_dups, bool *enable_dynamic_handling, void *user_data)
{
DR_ASSERT(enable_dups != NULL && enable_dynamic_handling != NULL);
- if (op_trace_after_instrs.get_value() > 0) {
+ if (op_trace_after_instrs.get_value() > 0 || op_trace_for_instrs.get_value() > 0 ||
+ op_retrace_every_instrs.get_value() > 0) {
*enable_dups = true;
drbbdup_status_t res =
drbbdup_register_case_encoding(drbbdup_ctx, BBDUP_MODE_COUNT);
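The lock-plus-recheck in reached_traced_instrs_threshold() above keeps two threads that cross the threshold concurrently from advancing the window twice; the guard in isolation (a sketch using std:: primitives; the real code uses DR's mutex and the TLS window slot):

    #include <atomic>
    #include <cstdint>
    #include <mutex>

    static std::mutex window_mutex;
    static std::atomic<int64_t> current_window(0);

    // Rotate to the next window only if the window this thread observed when
    // it filled its buffer is still the live one; otherwise another thread
    // already rotated and we must not advance twice.
    static void
    maybe_rotate_window(int64_t window_at_buffer_start)
    {
        std::lock_guard<std::mutex> guard(window_mutex);
        if (current_window.load(std::memory_order_acquire) !=
            window_at_buffer_start)
            return; // Lost the race: rotation already happened.
        current_window.fetch_add(1, std::memory_order_release);
    }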
@@ -785,6 +917,7 @@ instrumentation_drbbdup_init()
DR_ASSERT(res == DRBBDUP_SUCCESS);
/* We just want barriers and atomic ops: no locks b/c they are not safe. */
DR_ASSERT(tracing_disabled.is_lock_free());
+ DR_ASSERT(traced_instr_count.is_lock_free());
}
static void
@@ -879,19 +1012,19 @@ instrument_delay_instrs(void *drcontext, void *tag, instrlist_t *ilist, user_dat
/* Inserts a conditional branch that jumps to skip_label if reg_skip_if_zero's
* value is zero.
- * Returns a temp reg that must be passed to insert_conditional_skip_target() at
+ * "*reg_tmp" must start out as DR_REG_NULL. It will hold a temp reg that must be passed
+ * to any subsequent call here as well as to insert_conditional_skip_target() at
* the point where skip_label should be inserted. Additionally, the
* app_regs_at_skip set must be empty prior to calling and it must be passed
* to insert_conditional_skip_target().
* reg_skip_if_zero must be DR_REG_XCX on x86.
*/
-static reg_id_t
+static void
insert_conditional_skip(void *drcontext, instrlist_t *ilist, instr_t *where,
- reg_id_t reg_skip_if_zero, instr_t *skip_label,
- bool short_reaches, reg_id_set_t &app_regs_at_skip)
+ reg_id_t reg_skip_if_zero, reg_id_t *reg_tmp INOUT,
+ instr_t *skip_label, bool short_reaches,
+ reg_id_set_t &app_regs_at_skip)
{
- reg_id_t reg_tmp = DR_REG_NULL;
-
// Record the registers that will need barriers at the skip target.
for (reg_id_t reg = DR_REG_START_GPR; reg <= DR_REG_STOP_GPR; ++reg) {
drreg_reserve_info_t info = { sizeof(info) };
@@ -933,13 +1066,17 @@ insert_conditional_skip(void *drcontext, instrlist_t *ilist, instr_t *where,
MINSERT(ilist, where, noskip);
} else {
/* There is no jecxz/cbz like instr on ARM-A32 mode, so we have to
- * save aflags to a temp reg before check.
+ * save aflags to a temp reg before the cmp.
* XXX optimization: use drreg to avoid aflags save/restore.
*/
- if (drreg_reserve_register(drcontext, ilist, where, &scratch_reserve_vec,
-                                   &reg_tmp) != DRREG_SUCCESS)
- FATAL("Fatal error: failed to reserve reg.");
- dr_save_arith_flags_to_reg(drcontext, ilist, where, reg_tmp);
+ if (*reg_tmp != DR_REG_NULL) {
+ /* A prior call has already saved the flags. */
+ } else {
+ if (drreg_reserve_register(drcontext, ilist, where, &scratch_reserve_vec,
+ reg_tmp) != DRREG_SUCCESS)
+ FATAL("Fatal error: failed to reserve reg.");
+ dr_save_arith_flags_to_reg(drcontext, ilist, where, *reg_tmp);
+ }
MINSERT(ilist, where,
INSTR_CREATE_cmp(drcontext, opnd_create_reg(reg_skip_if_zero),
OPND_CREATE_INT(0)));
@@ -953,11 +1090,10 @@ insert_conditional_skip(void *drcontext, instrlist_t *ilist, instr_t *where,
INSTR_CREATE_cbz(drcontext, opnd_create_instr(skip_label),
opnd_create_reg(reg_skip_if_zero)));
#endif
- return reg_tmp;
}
/* Should be called at the point where skip_label should be inserted.
- * reg_tmp must be the return value from insert_conditional_skip().
+ * reg_tmp must be the "*reg_tmp" output value from insert_conditional_skip().
* Inserts a barrier for all app-valued registers at the jump point
* (stored in app_regs_at_skip), to help avoid problems with different
* paths having different lazy reg restoring from drreg.
@@ -995,6 +1131,8 @@ insert_conditional_skip_target(void *drcontext, instrlist_t *ilist, instr_t *whe
/* We insert code to read from trace buffer and check whether the redzone
* is reached. If redzone is reached, the clean call will be called.
+ * Additionally, for tracing windows, we also check for a mode switch and
+ * invoke the clean call if our tracing window is over.
*/
static void
instrument_clean_call(void *drcontext, instrlist_t *ilist, instr_t *where,
@@ -1018,26 +1156,98 @@ instrument_clean_call(void *drcontext, instrlist_t *ilist, instr_t *where,
/* XXX: clean call is too long to use cbz to skip. */
short_reaches = false;
#endif
+
+ if (has_tracing_windows()) {
+        // We need to make the clean call if the mode has changed back to counting.
+        // To detect a change, even a double change back to tracing, we compare the
+        // TLS-stored last window to the current tracing_window.  To avoid touching
+        // the flags and adding another branch (jumping over the filter and redzone
+        // checks, e.g.), our strategy is to arrange for the existing redzone load
+        // to trigger the call for us when the two window values are not equal, by
+        // writing their difference onto the next buffer slot.  This requires a
+        // store and two scratch regs, so perhaps it should be measured against a
+        // branch-based scheme, but we assume we're i/o bound and so this will not
+        // affect overhead.
+ reg_id_t reg_mine = DR_REG_NULL, reg_global = DR_REG_NULL;
+        if (drreg_reserve_register(drcontext, ilist, where, NULL, &reg_mine) !=
+ DRREG_SUCCESS ||
+            drreg_reserve_register(drcontext, ilist, where, NULL, &reg_global) !=
+ DRREG_SUCCESS)
+ FATAL("Fatal error: failed to reserve reg.");
+#ifdef AARCHXX
+ instrlist_insert_mov_immed_ptrsz(drcontext, (ptr_int_t)&tracing_window,
+ opnd_create_reg(reg_global), ilist, where, NULL,
+ NULL);
+# ifdef AARCH64
+ MINSERT(ilist, where,
+ INSTR_CREATE_ldar(drcontext, opnd_create_reg(reg_global),
+ OPND_CREATE_MEMPTR(reg_global, 0)));
+# else
+ MINSERT(ilist, where,
+ XINST_CREATE_load(drcontext, opnd_create_reg(reg_global),
+ OPND_CREATE_MEMPTR(reg_global, 0)));
+ MINSERT(ilist, where, INSTR_CREATE_dmb(drcontext, OPND_CREATE_INT(DR_DMB_ISH)));
+# endif
+#else
+ MINSERT(ilist, where,
+ XINST_CREATE_load(drcontext, opnd_create_reg(reg_global),
+ OPND_CREATE_ABSMEM(&tracing_window, OPSZ_PTR)));
+#endif
+ dr_insert_read_raw_tls(drcontext, ilist, where, tls_seg,
+ tls_offs + sizeof(void *) * MEMTRACE_TLS_OFFS_WINDOW,
+ reg_mine);
+#ifdef AARCHXX
+ MINSERT(ilist, where,
+ XINST_CREATE_sub(drcontext, opnd_create_reg(reg_mine),
+ opnd_create_reg(reg_global)));
+#else
+        // Our version of a flags-free reg-reg subtraction: negate one reg via
+        // 1's complement plus 1 (NOT, then an LEA that adds 1) and then add the
+        // two regs using the base+index of another LEA.
+ MINSERT(ilist, where, INSTR_CREATE_not(drcontext, opnd_create_reg(reg_global)));
+ MINSERT(ilist, where,
+ INSTR_CREATE_lea(drcontext, opnd_create_reg(reg_global),
+ OPND_CREATE_MEM_lea(reg_global, DR_REG_NULL, 0, 1)));
+ MINSERT(ilist, where,
+ INSTR_CREATE_lea(drcontext, opnd_create_reg(reg_mine),
+ OPND_CREATE_MEM_lea(reg_mine, reg_global, 1, 0)));
+#endif
+        // To avoid writing a 0 on top of the redzone, we read the current slot
+        // value from the buffer and add to it the local ("mine") window minus the
+        // global window.  The redzone value is -1 and mine minus global is always
+        // non-positive, so a redzone slot can never become 0 (which could let the
+        // buffer fill past its end).
+ MINSERT(ilist, where,
+ XINST_CREATE_load(drcontext, opnd_create_reg(reg_global),
+ OPND_CREATE_MEMPTR(reg_ptr, 0)));
+ MINSERT(ilist, where,
+ XINST_CREATE_add(drcontext, opnd_create_reg(reg_mine),
+ opnd_create_reg(reg_global)));
+ MINSERT(ilist, where,
+ XINST_CREATE_store(drcontext, OPND_CREATE_MEMPTR(reg_ptr, 0),
+ opnd_create_reg(reg_mine)));
+ if (drreg_unreserve_register(drcontext, ilist, where, reg_global) !=
+ DRREG_SUCCESS ||
+ drreg_unreserve_register(drcontext, ilist, where, reg_mine) != DRREG_SUCCESS)
+ FATAL("Fatal error: failed to unreserve scratch reg.\n");
+ }
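The net effect of the sequence above is to store slot + (mine - global) back
into the slot. A thread's cached window never runs ahead of the global one, so
the delta is non-positive: an empty slot (0) stays 0 exactly when the windows
match, and the -1 redzone sentinel can never become 0. A self-contained check
of that arithmetic (on x86 the negation is computed flags-free as NOT plus an
LEA, i.e. -global == ~global + 1):

    #include <cassert>
    #include <cstdint>

    // Models the inlined window check: the stored value is nonzero exactly
    // when the slot already held the redzone (-1) or the windows differ, so
    // the existing "nonzero slot => clean call" logic covers both cases.
    static intptr_t
    slot_after_check(intptr_t slot, intptr_t mine, intptr_t global)
    {
        return slot + (mine - global); // mine <= global, so the delta is <= 0
    }

    int
    main()
    {
        assert(slot_after_check(0, 7, 7) == 0);  // same window, empty slot: skip
        assert(slot_after_check(0, 7, 8) != 0);  // window advanced: make the call
        assert(slot_after_check(-1, 7, 7) != 0); // redzone hit: call as before
        return 0;
    }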
+
+ reg_id_t reg_tmp = DR_REG_NULL;
instr_t *skip_thread = INSTR_CREATE_label(drcontext);
- reg_id_t reg_thread = DR_REG_NULL;
reg_id_set_t app_regs_at_skip_thread;
if (op_L0_filter.get_value() && thread_filtering_enabled) {
- reg_thread =
- insert_conditional_skip(drcontext, ilist, where, reg_ptr, skip_thread,
- short_reaches, app_regs_at_skip_thread);
+        insert_conditional_skip(drcontext, ilist, where, reg_ptr, &reg_tmp, skip_thread,
+ short_reaches, app_regs_at_skip_thread);
}
MINSERT(ilist, where,
XINST_CREATE_load(drcontext, opnd_create_reg(reg_ptr),
OPND_CREATE_MEMPTR(reg_ptr, 0)));
reg_id_set_t app_regs_at_skip_call;
- reg_id_t reg_tmp =
- insert_conditional_skip(drcontext, ilist, where, reg_ptr, skip_call,
- short_reaches, app_regs_at_skip_call);
+    insert_conditional_skip(drcontext, ilist, where, reg_ptr, &reg_tmp, skip_call,
+ short_reaches, app_regs_at_skip_call);
+
dr_insert_clean_call_ex(drcontext, ilist, where, (void *)clean_call,
DR_CLEANCALL_ALWAYS_OUT_OF_LINE, 0);
insert_conditional_skip_target(drcontext, ilist, where, skip_call, reg_tmp,
app_regs_at_skip_call);
- insert_conditional_skip_target(drcontext, ilist, where, skip_thread, reg_thread,
+ insert_conditional_skip_target(drcontext, ilist, where, skip_thread, reg_tmp,
app_regs_at_skip_thread);
}
@@ -1419,8 +1629,8 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst
short_reaches = true;
}
#endif
- reg_skip = insert_conditional_skip(drcontext, bb, where, reg_ptr, skip_instru,
- short_reaches, app_regs_at_skip);
+        insert_conditional_skip(drcontext, bb, where, reg_ptr, &reg_skip, skip_instru,
+ short_reaches, app_regs_at_skip);
}
}
@@ -1668,15 +1878,44 @@ static uint64 instr_count;
*/
#define DELAY_EXACT_THRESHOLD (10 * 1024 * 1024)
#define DELAY_COUNTDOWN_UNIT 10000
+// For -trace_for_instrs without -retrace_every_instrs we count forever,
+// but to avoid the complexity of different instrumentation we need a threshold.
+#define DELAY_FOREVER_THRESHOLD (1024 * 1024 * 1024)
+
+std::atomic<bool> reached_trace_after_instrs;
+
+static bool
+has_instr_count_threshold()
+{
+ if (op_trace_after_instrs.get_value() > 0 &&
+ !reached_trace_after_instrs.load(std::memory_order_acquire))
+ return true;
+ if (op_retrace_every_instrs.get_value() > 0)
+ return true;
+ return false;
+}
+
+static uint64
+instr_count_threshold()
+{
+ if (op_trace_after_instrs.get_value() > 0 &&
+ !reached_trace_after_instrs.load(std::memory_order_acquire))
+ return op_trace_after_instrs.get_value();
+ if (op_retrace_every_instrs.get_value() > 0)
+ return op_retrace_every_instrs.get_value();
+ return DELAY_FOREVER_THRESHOLD;
+}
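Concretely, with -trace_after_instrs 20K -trace_for_instrs 5K
-retrace_every_instrs 35K (the combination used by the new tests below),
instr_count_threshold() returns 20K until the first window begins and 35K for
every later counting phase, while -trace_for_instrs bounds each window itself.
A hypothetical timeline, not tool output:

    // count 20K instrs   (threshold = op_trace_after_instrs)
    // trace  5K instrs   (bounded by op_trace_for_instrs)
    // count 35K instrs   (threshold = op_retrace_every_instrs)
    // trace  5K instrs
    // ...repeating until the application exits.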
static void
hit_instr_count_threshold(app_pc next_pc)
{
+ if (!has_instr_count_threshold())
+ return;
#ifdef DELAYED_CHECK_INLINED
/* XXX: We could do the same thread-local counters for non-inlined.
* We'd then switch to std::atomic or something for 32-bit.
*/
- if (op_trace_after_instrs.get_value() > DELAY_EXACT_THRESHOLD) {
+ if (instr_count_threshold() > DELAY_EXACT_THRESHOLD) {
void *drcontext = dr_get_current_drcontext();
per_thread_t *data = (per_thread_t *)drmgr_get_tls_field(drcontext, tls_idx);
int64 myval = *(int64 *)TLS_SLOT(data->seg_base, MEMTRACE_TLS_OFFS_ICOUNTDOWN);
@@ -1684,24 +1923,48 @@ hit_instr_count_threshold(app_pc next_pc)
DELAY_COUNTDOWN_UNIT - myval);
*(uintptr_t *)TLS_SLOT(data->seg_base, MEMTRACE_TLS_OFFS_ICOUNTDOWN) =
DELAY_COUNTDOWN_UNIT;
- if (newval < op_trace_after_instrs.get_value())
+ if (newval < instr_count_threshold())
return;
}
#endif
- NOTIFY(0, "Hit delay threshold: enabling tracing.\n");
+ dr_mutex_lock(mutex);
+ if (tracing_disabled.load(std::memory_order_acquire) == BBDUP_MODE_TRACE) {
+ // Another thread already changed the mode.
+ dr_mutex_unlock(mutex);
+ return;
+ }
+ if (op_trace_after_instrs.get_value() > 0 &&
+ !reached_trace_after_instrs.load(std::memory_order_acquire))
+ NOTIFY(0, "Hit delay threshold: enabling tracing.\n");
+ else
+ NOTIFY(0, "Hit retrace threshold: enabling tracing.\n");
+ if (!reached_trace_after_instrs.load(std::memory_order_acquire)) {
+ reached_trace_after_instrs.store(true, std::memory_order_release);
+ }
+ // Reset for -retrace_every_instrs.
+#ifdef X64
+ dr_atomic_store64((volatile int64 *)&instr_count, 0);
+#else
+ // dr_atomic_store64 is not implemented for 32-bit, and it's technically not
+ // portably safe to take the address of std::atomic, so we rely on our mutex.
+ instr_count = 0;
+#endif
DR_ASSERT(tracing_disabled.load(std::memory_order_acquire) == BBDUP_MODE_COUNT);
tracing_disabled.store(BBDUP_MODE_TRACE, std::memory_order_release);
+ dr_mutex_unlock(mutex);
}
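Both mode flips, enabling tracing here and disabling it in
reached_traced_instrs_threshold() above, use the same guard: take the mutex,
re-check the atomic state under the lock, and bail out if another thread won
the race. The generic shape of that double-check, sketched with invented
names:

    #include <atomic>
    #include <mutex>

    static std::mutex mode_mutex;
    static std::atomic<int> mode; // e.g., counting vs. tracing

    // Flips "mode" from "expected" to "desired" exactly once even when many
    // threads cross the threshold together; losers return without acting.
    static bool
    flip_mode_once(int expected, int desired)
    {
        std::lock_guard<std::mutex> guard(mode_mutex);
        if (mode.load(std::memory_order_acquire) != expected)
            return false; // another thread already changed the mode
        mode.store(desired, std::memory_order_release);
        return true;
    }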
#ifndef DELAYED_CHECK_INLINED
static void
check_instr_count_threshold(uint incby, app_pc next_pc)
{
+ if (!has_instr_count_threshold())
+ return;
/* XXX i#5030: This is racy. We could make std::atomic, or, better, go and
* implement the inlining and i#5026's thread-private counting.
*/
instr_count += incby;
- if (instr_count > op_trace_after_instrs.get_value())
+ if (instr_count > instr_count_threshold())
hit_instr_count_threshold(next_pc);
}
#endif
@@ -1744,7 +2007,7 @@ event_inscount_app_instruction(void *drcontext, void *tag, instrlist_t *bb,
instr_t *skip_call = INSTR_CREATE_label(drcontext);
# ifdef X86_64
reg_id_t scratch = DR_REG_NULL;
- if (op_trace_after_instrs.get_value() > DELAY_EXACT_THRESHOLD) {
+ if (instr_count_threshold() > DELAY_EXACT_THRESHOLD) {
/* Contention on a global counter causes high overheads. We approximate the
* count by using thread-local counters and only merging into the global
* every so often.
@@ -1768,16 +2031,15 @@ event_inscount_app_instruction(void *drcontext, void *tag, instrlist_t *bb,
if (drreg_reserve_aflags(drcontext, bb, where) != DRREG_SUCCESS)
FATAL("Fatal error: failed to reserve aflags");
- if (op_trace_after_instrs.get_value() < INT_MAX) {
- MINSERT(
- bb, where,
- XINST_CREATE_cmp(drcontext, OPND_CREATE_ABSMEM(&instr_count, OPSZ_8),
- OPND_CREATE_INT32(op_trace_after_instrs.get_value())));
+ if (instr_count_threshold() < INT_MAX) {
+ MINSERT(bb, where,
+ XINST_CREATE_cmp(drcontext, OPND_CREATE_ABSMEM(&instr_count, OPSZ_8),
+ OPND_CREATE_INT32(instr_count_threshold())));
} else {
if (drreg_reserve_register(drcontext, bb, where, NULL, &scratch) !=
DRREG_SUCCESS)
FATAL("Fatal error: failed to reserve scratch register");
- instrlist_insert_mov_immed_ptrsz(drcontext, op_trace_after_instrs.get_value(),
+ instrlist_insert_mov_immed_ptrsz(drcontext, instr_count_threshold(),
opnd_create_reg(scratch), bb, where, NULL,
NULL);
MINSERT(bb, where,
@@ -1789,7 +2051,7 @@ event_inscount_app_instruction(void *drcontext, void *tag, instrlist_t *bb,
}
# elif defined(AARCH64)
reg_id_t scratch1, scratch2 = DR_REG_NULL;
- if (op_trace_after_instrs.get_value() > DELAY_EXACT_THRESHOLD) {
+ if (instr_count_threshold() > DELAY_EXACT_THRESHOLD) {
/* See the x86_64 comment on using thread-local counters to avoid contention. */
if (drreg_reserve_register(drcontext, bb, where, NULL, &scratch1) !=
DRREG_SUCCESS)
@@ -1901,6 +2163,12 @@ init_thread_in_process(void *drcontext)
IF_X86_ELSE(
IF_X64_ELSE(OFFLINE_FILE_TYPE_ARCH_X86_64, OFFLINE_FILE_TYPE_ARCH_X86_32),
IF_X64_ELSE(OFFLINE_FILE_TYPE_ARCH_AARCH64, OFFLINE_FILE_TYPE_ARCH_ARM32)));
+
+ if (has_tracing_windows())
+ set_local_window(data, tracing_window.load(std::memory_order_acquire));
+ else
+ set_local_window(data, -1);
+
if (op_offline.get_value()) {
/* We do not need to call drx_init before using drx_open_unique_appid_file.
* Since we're now in a subdir we could make the name simpler but this
@@ -1940,7 +2208,7 @@ init_thread_in_process(void *drcontext)
     proc_info += reinterpret_cast<offline_instru_t *>(instru)->append_thread_header(
proc_info, dr_get_thread_id(drcontext), file_type);
DR_ASSERT(BUFFER_SIZE_BYTES(buf) >= (size_t)(proc_info - (byte *)buf));
- write_trace_data(drcontext, (byte *)buf, proc_info);
+ write_trace_data(drcontext, (byte *)buf, proc_info, get_local_window(data));
/* put buf_base to TLS plus header slots as starting buf_ptr */
data->init_header_size = buf_hdr_slots_size;
@@ -2237,6 +2505,10 @@ drmemtrace_client_main(client_id_t id, int argc, const char *argv[])
FATAL("Usage error: L0I_size and L0D_size must be 0 or powers of 2.");
}
+ DR_ASSERT(std::atomic_is_lock_free(&reached_trace_after_instrs));
+ DR_ASSERT(std::atomic_is_lock_free(&tracing_disabled));
+ DR_ASSERT(std::atomic_is_lock_free(&tracing_window));
+
drreg_init_and_fill_vector(&scratch_reserve_vec, true);
#ifdef X86
if (op_L0_filter.get_value()) {
@@ -2344,8 +2616,8 @@ drmemtrace_client_main(client_id_t id, int argc, const char *argv[])
/* Mark any padding as redzone as well */
redzone_size = max_buf_size - trace_buf_size;
/* Append a throwaway header to get its size. */
- buf_hdr_slots_size =
- append_unit_header(NULL /*no TLS yet*/, buf, 0 /*doesn't matter*/);
+ buf_hdr_slots_size = append_unit_header(
+ NULL /*no TLS yet*/, buf, 0 /*doesn't matter*/, has_tracing_windows() ? 1 : -1);
DR_ASSERT(BUFFER_SIZE_BYTES(buf) >= buf_hdr_slots_size);
client_id = id;
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index 727d60cb287..feb3bf7f9ed 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -3345,6 +3345,9 @@ if (BUILD_CLIENTS)
"-simulator_type basic_counts -trace_after_instrs 20K -max_global_trace_refs 10K"
"${annotation_test_args_shorter}")
+ torunonly_drcachesim(windows-simple ${ci_shared_app}
+ "-trace_after_instrs 20K -trace_for_instrs 5K -retrace_every_instrs 35K -simulator_type basic_counts" "")
+
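The windows-simple test above drives the full pipeline; an equivalent manual
run (with a placeholder application path) would look something like:

    bin64/drrun -t drcachesim -trace_after_instrs 20K -trace_for_instrs 5K \
        -retrace_every_instrs 35K -simulator_type basic_counts -- ./my_app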
# Test that "Warmup hits" and "Warmup misses" are printed out
torunonly_drcachesim(warmup-valid ${ci_shared_app} "-warmup_refs 1" "")
@@ -3603,6 +3606,21 @@ if (BUILD_CLIENTS)
"-trace_after_instrs 200M -record_heap"
"@-simulator_type@basic_counts" "")
+ torunonly_drcacheoff(windows-simple ${ci_shared_app}
+ "-trace_after_instrs 20K -trace_for_instrs 5K -retrace_every_instrs 35K"
+ "@-simulator_type@basic_counts" "")
+ if (NOT WINDOWS) # TODO i#5390: Fix drbbdup hiding block-final jmps
+ # Ensure the invariant checker handles window transitions.
+ torunonly_drcacheoff(windows-invar ${ci_shared_app}
+ "-trace_after_instrs 20K -trace_for_instrs 5K -retrace_every_instrs 35K"
+ "@-simulator_type@invariant_checker" "")
+ endif ()
+ if (X86 AND X64 AND UNIX)
+ torunonly_drcacheoff(windows-asm allasm_repstr
+ "-trace_after_instrs 3 -trace_for_instrs 4 -retrace_every_instrs 4"
+ "@-simulator_type@basic_counts" "")
+ endif ()
+
# __builtin_prefetch used in the test is not defined on MSVC.
if (NOT MSVC)
torunonly_drcacheoff(builtin-prefetch-basic-counts builtin_prefetch "" "@-simulator_type@basic_counts" "")
diff --git a/suite/tests/samples/memtrace_simple_repstr.templatex b/suite/tests/samples/memtrace_simple_repstr.templatex
index 1869b9b3b21..c3c4dacc8c8 100644
--- a/suite/tests/samples/memtrace_simple_repstr.templatex
+++ b/suite/tests/samples/memtrace_simple_repstr.templatex
@@ -17,3 +17,12 @@ Format: <data address>: <data size>, <\(r\)ead/\(w\)rite/opcode>
0x[0-9a-f]*: 1, r
0x[0-9a-f]*: 1, w
Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!
+Adios world!