diff --git a/clients/drcachesim/CMakeLists.txt b/clients/drcachesim/CMakeLists.txt index 3db2a9a9fbd..ac11156ed7f 100644 --- a/clients/drcachesim/CMakeLists.txt +++ b/clients/drcachesim/CMakeLists.txt @@ -74,6 +74,7 @@ if (ZLIB_FOUND) "until then, disabling zip output and fast seeking") set(zip_reader "") set(zlib_libs ${ZLIB_LIBRARIES}) + set(ZIP_FOUND OFF) else () file(GLOB minizip_srcs "${minizip_dir}/*.c") if (NOT WIN32) @@ -81,6 +82,7 @@ if (ZLIB_FOUND) endif () add_library(minizip STATIC ${minizip_srcs}) add_definitions(-DHAS_ZIP) + set(ZIP_FOUND ON) # We add "minizip/" to avoid collisions with system "zip.h" on Mac. include_directories(${minizip_dir}/..) DR_export_target(minizip) @@ -768,6 +770,21 @@ if (BUILD_TESTS) use_DynamoRIO_extension(tool.drcacheoff.burst_traceopts drcovlib_static) endif () + + if (X86 AND X64 AND ZIP_FOUND) + # XXX i#5538: Add trace files for other arches. + set(zip_path + "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.allasm_x86_64.trace.zip") + add_executable(tool.drcacheoff.skip_unit_tests tests/skip_unit_tests.cpp) + configure_DynamoRIO_standalone(tool.drcacheoff.skip_unit_tests) + target_link_libraries(tool.drcacheoff.skip_unit_tests drmemtrace_analyzer + drmemtrace_view drmemtrace_raw2trace) + use_DynamoRIO_extension(tool.drcacheoff.skip_unit_tests drreg_static) + use_DynamoRIO_extension(tool.drcacheoff.skip_unit_tests drcovlib_static) + use_DynamoRIO_extension(tool.drcacheoff.skip_unit_tests drdecode) + add_test(NAME tool.drcacheoff.skip_unit_tests + COMMAND tool.drcacheoff.skip_unit_tests --trace_file ${zip_path}) + endif () endif () ################################################## diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index 0a79ae7a9c4..56708300137 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -176,12 +176,13 @@ analyzer_t::init_file_reader(const std::string &trace_path, int verbosity) } analyzer_t::analyzer_t(const std::string 
&trace_path, analysis_tool_t **tools, - int num_tools, int worker_count) + int num_tools, int worker_count, uint64_t skip_instrs) : success_(true) , num_tools_(num_tools) , tools_(tools) , parallel_(true) , worker_count_(worker_count) + , skip_instrs_(skip_instrs) { if (!init_file_reader(trace_path)) { success_ = false; @@ -273,9 +274,17 @@ analyzer_t::process_tasks(std::vector *tasks) tdata->index, worker_data[i], tdata->iter.get()); } VPRINT(this, 1, "shard_data[0] is %p\n", shard_data[0]); + if (skip_instrs_ > 0) { + // We skip in each thread. + // TODO i#5538: Add top-level header data to memtrace_stream_t for + // access by tools, since we're skipping it here. We considered + // not skipping until we see the 1st timestamp but the stream access + // approach has other benefits and seems cleaner. + (*tdata->iter) = (*tdata->iter).skip_instructions(skip_instrs_); + } for (; *tdata->iter != *trace_end_; ++(*tdata->iter)) { + const memref_t &memref = **tdata->iter; for (int i = 0; i < num_tools_; ++i) { - const memref_t &memref = **tdata->iter; if (!tools_[i]->parallel_shard_memref(shard_data[i], memref)) { tdata->error = tools_[i]->parallel_shard_error(shard_data[i]); VPRINT(this, 1, @@ -314,9 +323,13 @@ analyzer_t::run() if (!parallel_) { if (!start_reading()) return false; + if (skip_instrs_ > 0) { + // TODO i#5538: Add top-level header data to memtrace_stream_t; see above. + (*serial_trace_iter_) = (*serial_trace_iter_).skip_instructions(skip_instrs_); + } for (; *serial_trace_iter_ != *trace_end_; ++(*serial_trace_iter_)) { + const memref_t &memref = **serial_trace_iter_; for (int i = 0; i < num_tools_; ++i) { - memref_t memref = **serial_trace_iter_; // We short-circuit and exit on an error to avoid confusion over // the results and avoid wasted continued work. 
if (!tools_[i]->process_memref(memref)) { diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h index 8e13f8d9996..48a177dcb49 100644 --- a/clients/drcachesim/analyzer.h +++ b/clients/drcachesim/analyzer.h @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2016-2020 Google, Inc. All rights reserved. + * Copyright (c) 2016-2022 Google, Inc. All rights reserved. * **********************************************************/ /* @@ -66,7 +66,8 @@ class analyzer_t { analyzer_t(); virtual ~analyzer_t(); /**< Destructor. */ /** Returns whether the analyzer was created successfully. */ - virtual bool operator!(); + virtual bool + operator!(); /** Returns further information on an error in initializing the analyzer. */ virtual std::string get_error_string(); @@ -83,7 +84,7 @@ class analyzer_t { * The analyzer calls the initialize() function on each tool before use. */ analyzer_t(const std::string &trace_path, analysis_tool_t **tools, int num_tools, - int worker_count = 0); + int worker_count = 0, uint64_t skip_instrs = 0); /** Launches the analysis process. 
*/ virtual bool run(); @@ -164,6 +165,7 @@ class analyzer_t { std::vector> worker_tasks_; int verbosity_ = 0; const char *output_prefix_ = "[analyzer]"; + uint64_t skip_instrs_ = 0; }; #endif /* _ANALYZER_H_ */ diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index 2ffc82f5c56..1a022324f5b 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -52,6 +52,7 @@ analyzer_multi_t::analyzer_multi_t() { worker_count_ = op_jobs.get_value(); + skip_instrs_ = op_skip_instrs.get_value(); // Initial measurements show it's sometimes faster to keep the parallel model // of using single-file readers but use them sequentially, as opposed to // the every-file interleaving reader, but the user can specify -jobs 1, so diff --git a/clients/drcachesim/common/memtrace_stream.h b/clients/drcachesim/common/memtrace_stream.h index e25bba70d40..6acf8757c16 100644 --- a/clients/drcachesim/common/memtrace_stream.h +++ b/clients/drcachesim/common/memtrace_stream.h @@ -37,7 +37,7 @@ * the record and instruction ordinals within the stream, in the presence of * skipping: we could add fields to memref but we'd either have to append * and have them at different offsets for each type or we'd have to break - * compatbility to prepend every time we added more; or we could add parameters + * compatibility to prepend every time we added more; or we could add parameters * to process_memref(). Passing an interface to the init routines seems * the simplest and most flexible. */ diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index aae90caf881..0745c3c7e5c 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -487,12 +487,22 @@ droption_t "For simulator types that support it, limits analyis to the single " "thread with the given identifier. 
0 enables all threads."); +droption_t op_skip_instrs( + DROPTION_SCOPE_FRONTEND, "skip_instrs", 0, "Number of instructions to skip", + "Specifies the number of instructions to skip in the beginning of the trace " + "analysis. For serial iteration, this number is " + "computed just once across the interleaving sequence of all threads; for parallel " + "iteration, each thread skips this many instructions. When built with zipfile " + "support, this skipping is optimized and large instruction counts can be quickly " + "skipped; this is not the case for -skip_refs."); + droption_t op_skip_refs(DROPTION_SCOPE_FRONTEND, "skip_refs", 0, "Number of memory references to skip", - "Specifies the number of references to skip " - "in the beginning of the application execution. " - "These memory references are dropped instead of being simulated."); + "Specifies the number of references to skip in the beginning of the " + "application execution. These memory references are dropped instead " + "of being simulated. 
This skipping may be slow for large skip values; " + "consider -skip_instrs for a faster method of skipping."); droption_t op_warmup_refs( DROPTION_SCOPE_FRONTEND, "warmup_refs", 0, diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 0b137341368..5e95e79bff1 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -124,6 +124,7 @@ extern droption_t op_tracer; extern droption_t op_tracer_alt; extern droption_t op_tracer_ops; extern droption_t op_only_thread; +extern droption_t op_skip_instrs; extern droption_t op_skip_refs; extern droption_t op_warmup_refs; extern droption_t op_warmup_fraction; diff --git a/clients/drcachesim/reader/file_reader.h b/clients/drcachesim/reader/file_reader.h index cd3023617b0..b7a3c4cd169 100644 --- a/clients/drcachesim/reader/file_reader.h +++ b/clients/drcachesim/reader/file_reader.h @@ -38,6 +38,7 @@ #ifndef _FILE_READER_H_ #define _FILE_READER_H_ 1 +#include #include #include #include @@ -55,11 +56,7 @@ # ifdef WINDOWS # define ZHEX64_FORMAT_STRING "%016I64x" # else -# if defined(__i386__) || defined(__arm__) || defined(__APPLE__) -# define ZHEX64_FORMAT_STRING "%016llx" -# else -# define ZHEX64_FORMAT_STRING "%016lx" -# endif +# define ZHEX64_FORMAT_STRING "%" PRIx64 # endif #endif @@ -100,6 +97,35 @@ template class file_reader_t : public reader_t { virtual bool is_complete(); + reader_t & + skip_instructions(uint64_t instruction_count) override + { + if (input_files_.size() > 1) { + // TODO i#5538: For fast thread-interleaved (whether serial here or the + // forthcoming per-cpu iteration) we need to read in the schedule file(s) + // that raw2trace writes out so that we can compute how far to separately + // fast-skip in each interleaved thread by calling the per-thread version. + // We'll also need to update the memref pid+tid state since we're not + // repeating top headers in every thread after a skip. For now this is a + // slow linear walk. 
+ return reader_t::skip_instructions(instruction_count); + } + // If the user asks to skip from the very start, we still need to find the chunk + // count marker and drain the header queue. + // TODO i#5538: Record all of the header values until the first timestamp + // and present them as new memtrace_stream_t interfaces. + while (chunk_instr_count_ == 0) { + input_entry_ = read_next_entry(); + process_input_entry(); + } + if (!queues_[0].empty()) + ERRMSG("Failed to drain header queue\n"); + bool eof = false; + if (!skip_thread_instructions(0, instruction_count, &eof) || eof) + at_eof_ = true; + return *this; + } + protected: bool read_next_thread_entry(size_t thread_index, OUT trace_entry_t *entry, @@ -297,6 +323,25 @@ template class file_reader_t : public reader_t { return nullptr; } + virtual bool + skip_thread_instructions(size_t thread_index, uint64_t instruction_count, + OUT bool *eof) + { + // Default implementation for file types that have no fast seeking and must do a + // linear walk. + uint64_t stop_count_ = cur_instr_count_ + instruction_count + 1; + while (cur_instr_count_ < stop_count_) { + if (!read_next_thread_entry(thread_index, &entry_copy_, eof)) + return false; + // Update core state. + input_entry_ = &entry_copy_; + process_input_entry(); + // TODO i#5538: Remember the last timestamp+cpu and insert it; share + // code with the zipfile reader. 
+ } + return true; + } + private: std::string input_path_; std::vector input_path_list_; diff --git a/clients/drcachesim/reader/reader.cpp b/clients/drcachesim/reader/reader.cpp index 26f9852fa70..da6587f271a 100644 --- a/clients/drcachesim/reader/reader.cpp +++ b/clients/drcachesim/reader/reader.cpp @@ -67,231 +67,251 @@ reader_t::operator++() } VPRINT(this, 4, "RECV: type=%d, size=%d, addr=0x%zx\n", input_entry_->type, input_entry_->size, input_entry_->addr); - bool have_memref = false; - switch (input_entry_->type) { - case TRACE_TYPE_READ: - case TRACE_TYPE_WRITE: - case TRACE_TYPE_PREFETCH: - case TRACE_TYPE_PREFETCH_READ_L1: - case TRACE_TYPE_PREFETCH_READ_L2: - case TRACE_TYPE_PREFETCH_READ_L3: - case TRACE_TYPE_PREFETCHNTA: - case TRACE_TYPE_PREFETCH_READ: - case TRACE_TYPE_PREFETCH_WRITE: - case TRACE_TYPE_PREFETCH_INSTR: - case TRACE_TYPE_PREFETCH_READ_L1_NT: - case TRACE_TYPE_PREFETCH_READ_L2_NT: - case TRACE_TYPE_PREFETCH_READ_L3_NT: - case TRACE_TYPE_PREFETCH_INSTR_L1: - case TRACE_TYPE_PREFETCH_INSTR_L1_NT: - case TRACE_TYPE_PREFETCH_INSTR_L2: - case TRACE_TYPE_PREFETCH_INSTR_L2_NT: - case TRACE_TYPE_PREFETCH_INSTR_L3: - case TRACE_TYPE_PREFETCH_INSTR_L3_NT: - case TRACE_TYPE_PREFETCH_WRITE_L1: - case TRACE_TYPE_PREFETCH_WRITE_L1_NT: - case TRACE_TYPE_PREFETCH_WRITE_L2: - case TRACE_TYPE_PREFETCH_WRITE_L2_NT: - case TRACE_TYPE_PREFETCH_WRITE_L3: - case TRACE_TYPE_PREFETCH_WRITE_L3_NT: - have_memref = true; - assert(cur_tid_ != 0 && cur_pid_ != 0); - cur_ref_.data.pid = cur_pid_; - cur_ref_.data.tid = cur_tid_; - cur_ref_.data.type = (trace_type_t)input_entry_->type; - cur_ref_.data.size = input_entry_->size; - cur_ref_.data.addr = input_entry_->addr; - // The trace stream always has the instr fetch first, which we - // use to obtain the PC for subsequent data references. 
- cur_ref_.data.pc = cur_pc_; + if (process_input_entry()) break; - case TRACE_TYPE_ENCODING: - if (last_encoding_.size + input_entry_->size > MAX_ENCODING_LENGTH) { - ERRMSG("Invalid too-large encoding size %zu + %d\n", last_encoding_.size, - input_entry_->size); - assert(false); - at_eof_ = true; - break; - } - memcpy(last_encoding_.bits + last_encoding_.size, input_entry_->encoding, + } + + return *this; +} + +bool +reader_t::process_input_entry() +{ + bool have_memref = false; + switch (input_entry_->type) { + case TRACE_TYPE_READ: + case TRACE_TYPE_WRITE: + case TRACE_TYPE_PREFETCH: + case TRACE_TYPE_PREFETCH_READ_L1: + case TRACE_TYPE_PREFETCH_READ_L2: + case TRACE_TYPE_PREFETCH_READ_L3: + case TRACE_TYPE_PREFETCHNTA: + case TRACE_TYPE_PREFETCH_READ: + case TRACE_TYPE_PREFETCH_WRITE: + case TRACE_TYPE_PREFETCH_INSTR: + case TRACE_TYPE_PREFETCH_READ_L1_NT: + case TRACE_TYPE_PREFETCH_READ_L2_NT: + case TRACE_TYPE_PREFETCH_READ_L3_NT: + case TRACE_TYPE_PREFETCH_INSTR_L1: + case TRACE_TYPE_PREFETCH_INSTR_L1_NT: + case TRACE_TYPE_PREFETCH_INSTR_L2: + case TRACE_TYPE_PREFETCH_INSTR_L2_NT: + case TRACE_TYPE_PREFETCH_INSTR_L3: + case TRACE_TYPE_PREFETCH_INSTR_L3_NT: + case TRACE_TYPE_PREFETCH_WRITE_L1: + case TRACE_TYPE_PREFETCH_WRITE_L1_NT: + case TRACE_TYPE_PREFETCH_WRITE_L2: + case TRACE_TYPE_PREFETCH_WRITE_L2_NT: + case TRACE_TYPE_PREFETCH_WRITE_L3: + case TRACE_TYPE_PREFETCH_WRITE_L3_NT: + have_memref = true; + assert(cur_tid_ != 0 && cur_pid_ != 0); + cur_ref_.data.pid = cur_pid_; + cur_ref_.data.tid = cur_tid_; + cur_ref_.data.type = (trace_type_t)input_entry_->type; + cur_ref_.data.size = input_entry_->size; + cur_ref_.data.addr = input_entry_->addr; + // The trace stream always has the instr fetch first, which we + // use to obtain the PC for subsequent data references. 
+ cur_ref_.data.pc = cur_pc_; + break; + case TRACE_TYPE_ENCODING: + if (last_encoding_.size + input_entry_->size > MAX_ENCODING_LENGTH) { + ERRMSG("Invalid too-large encoding size %zu + %d\n", last_encoding_.size, input_entry_->size); - last_encoding_.size += input_entry_->size; - break; - case TRACE_TYPE_INSTR_MAYBE_FETCH: - // While offline traces can convert rep string per-iter instrs into - // no-fetch entries, online can't w/o extra work, so we do the work - // here: - if (prev_instr_addr_ == input_entry_->addr) - input_entry_->type = TRACE_TYPE_INSTR_NO_FETCH; - else - input_entry_->type = TRACE_TYPE_INSTR; - ANNOTATE_FALLTHROUGH; - case TRACE_TYPE_INSTR: - case TRACE_TYPE_INSTR_DIRECT_JUMP: - case TRACE_TYPE_INSTR_INDIRECT_JUMP: - case TRACE_TYPE_INSTR_CONDITIONAL_JUMP: - case TRACE_TYPE_INSTR_DIRECT_CALL: - case TRACE_TYPE_INSTR_INDIRECT_CALL: - case TRACE_TYPE_INSTR_RETURN: - case TRACE_TYPE_INSTR_SYSENTER: - case TRACE_TYPE_INSTR_NO_FETCH: - assert(cur_tid_ != 0 && cur_pid_ != 0); - if (input_entry_->size == 0) { - // Just an entry to tell us the PC of the subsequent memref, - // used with -L0_filter where we don't reliably have icache - // entries prior to data entries. - cur_pc_ = input_entry_->addr; - } else { - have_memref = true; - cur_ref_.instr.pid = cur_pid_; - cur_ref_.instr.tid = cur_tid_; - cur_ref_.instr.type = (trace_type_t)input_entry_->type; - cur_ref_.instr.size = input_entry_->size; - cur_pc_ = input_entry_->addr; - cur_ref_.instr.addr = cur_pc_; - next_pc_ = cur_pc_ + cur_ref_.instr.size; - prev_instr_addr_ = input_entry_->addr; - if (cur_ref_.instr.type != TRACE_TYPE_INSTR_NO_FETCH) - ++cur_instr_count_; - // Look for encoding bits that belong to this instr. 
- if (last_encoding_.size > 0) { - if (last_encoding_.size != cur_ref_.instr.size) { - ERRMSG("Encoding size %zu != instr size %zu\n", - last_encoding_.size, cur_ref_.instr.size); - assert(false); - } - memcpy(cur_ref_.instr.encoding, last_encoding_.bits, - last_encoding_.size); - cur_ref_.instr.encoding_is_new = true; - encodings_[cur_ref_.instr.addr] = last_encoding_; - } else { - cur_ref_.instr.encoding_is_new = false; - const auto &it = encodings_.find(cur_ref_.instr.addr); - if (it != encodings_.end()) { - memcpy(cur_ref_.instr.encoding, it->second.bits, it->second.size); - } else if (!expect_no_encodings_) { - ERRMSG("Missing encoding for 0x%zx\n", cur_ref_.instr.addr); - assert(false); - } - } - last_encoding_.size = 0; - } + assert(false); + at_eof_ = true; break; - case TRACE_TYPE_INSTR_BUNDLE: + } + memcpy(last_encoding_.bits + last_encoding_.size, input_entry_->encoding, + input_entry_->size); + last_encoding_.size += input_entry_->size; + break; + case TRACE_TYPE_INSTR_MAYBE_FETCH: + // While offline traces can convert rep string per-iter instrs into + // no-fetch entries, online can't w/o extra work, so we do the work + // here: + if (prev_instr_addr_ == input_entry_->addr) + input_entry_->type = TRACE_TYPE_INSTR_NO_FETCH; + else + input_entry_->type = TRACE_TYPE_INSTR; + ANNOTATE_FALLTHROUGH; + case TRACE_TYPE_INSTR: + case TRACE_TYPE_INSTR_DIRECT_JUMP: + case TRACE_TYPE_INSTR_INDIRECT_JUMP: + case TRACE_TYPE_INSTR_CONDITIONAL_JUMP: + case TRACE_TYPE_INSTR_DIRECT_CALL: + case TRACE_TYPE_INSTR_INDIRECT_CALL: + case TRACE_TYPE_INSTR_RETURN: + case TRACE_TYPE_INSTR_SYSENTER: + case TRACE_TYPE_INSTR_NO_FETCH: + assert(cur_tid_ != 0 && cur_pid_ != 0); + if (input_entry_->size == 0) { + // Just an entry to tell us the PC of the subsequent memref, + // used with -L0_filter where we don't reliably have icache + // entries prior to data entries. 
+ cur_pc_ = input_entry_->addr; + } else { have_memref = true; - // The trace stream always has the instr fetch first, which we - // use to compute the starting PC for the subsequent instructions. - if (!(type_is_instr(cur_ref_.instr.type) || - cur_ref_.instr.type == TRACE_TYPE_INSTR_NO_FETCH)) { - // XXX i#3320: Diagnostics to track down the elusive remaining case of - // this assert on Appveyor. We'll remove and replace with just the - // assert once we have a fix. - ERRMSG("Invalid trace entry type %d before a bundle\n", - cur_ref_.instr.type); - assert(type_is_instr(cur_ref_.instr.type) || - cur_ref_.instr.type == TRACE_TYPE_INSTR_NO_FETCH); - } - cur_ref_.instr.size = input_entry_->length[bundle_idx_++]; - cur_pc_ = next_pc_; + cur_ref_.instr.pid = cur_pid_; + cur_ref_.instr.tid = cur_tid_; + cur_ref_.instr.type = (trace_type_t)input_entry_->type; + cur_ref_.instr.size = input_entry_->size; + cur_pc_ = input_entry_->addr; cur_ref_.instr.addr = cur_pc_; next_pc_ = cur_pc_ + cur_ref_.instr.size; - // input_entry_->size stores the number of instrs in this bundle - assert(input_entry_->size <= sizeof(input_entry_->length)); - if (bundle_idx_ == input_entry_->size) - bundle_idx_ = 0; - break; - case TRACE_TYPE_INSTR_FLUSH: - case TRACE_TYPE_DATA_FLUSH: - assert(cur_tid_ != 0 && cur_pid_ != 0); - cur_ref_.flush.pid = cur_pid_; - cur_ref_.flush.tid = cur_tid_; - cur_ref_.flush.type = (trace_type_t)input_entry_->type; - cur_ref_.flush.size = input_entry_->size; - cur_ref_.flush.addr = input_entry_->addr; - if (cur_ref_.flush.size != 0) - have_memref = true; - break; - case TRACE_TYPE_INSTR_FLUSH_END: - case TRACE_TYPE_DATA_FLUSH_END: - cur_ref_.flush.size = input_entry_->addr - cur_ref_.flush.addr; - have_memref = true; - break; - case TRACE_TYPE_THREAD: - cur_tid_ = (memref_tid_t)input_entry_->addr; - // tid2pid might not be filled in yet: if so, we expect a - // TRACE_TYPE_PID entry right after this one, and later asserts - // will complain if it wasn't there. 
- cur_pid_ = tid2pid_[cur_tid_]; - break; - case TRACE_TYPE_THREAD_EXIT: - cur_tid_ = (memref_tid_t)input_entry_->addr; - cur_pid_ = tid2pid_[cur_tid_]; - assert(cur_tid_ != 0 && cur_pid_ != 0); - // We do pass this to the caller but only some fields are valid: - cur_ref_.exit.pid = cur_pid_; - cur_ref_.exit.tid = cur_tid_; - cur_ref_.exit.type = (trace_type_t)input_entry_->type; - have_memref = true; - break; - case TRACE_TYPE_PID: - cur_pid_ = (memref_pid_t)input_entry_->addr; - // We do want to replace, in case of tid reuse. - tid2pid_[cur_tid_] = cur_pid_; - break; - case TRACE_TYPE_MARKER: - cur_ref_.marker.type = (trace_type_t)input_entry_->type; - if (!online_ && - (input_entry_->size == TRACE_MARKER_TYPE_VERSION || - input_entry_->size == TRACE_MARKER_TYPE_FILETYPE)) { - // Do not carry over a prior thread on a thread switch to a - // first-time-seen new thread, whose tid entry is *after* these - // markers for offline traces. - cur_pid_ = 0; - cur_tid_ = 0; - } else { - assert(cur_tid_ != 0 && cur_pid_ != 0); - } - cur_ref_.marker.pid = cur_pid_; - cur_ref_.marker.tid = cur_tid_; - cur_ref_.marker.marker_type = (trace_marker_type_t)input_entry_->size; - cur_ref_.marker.marker_value = input_entry_->addr; - // Look for timestamp+cpu duplicated from the prior chunk. Skip them on - // a linear walk. - // - // TODO i#5538: On a seek, cache the duplicated headers, and update the - // cached timestamp and cpu as do linear portion of seek, and then emit - // cached top-level headers plus the last timestamp+cpu at the target - // point. 
- if (chunk_instr_count_ > 0 && - cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP && - cur_instr_count_ / chunk_instr_count_ != - last_timestamp_instr_count_ / chunk_instr_count_) { - skip_next_cpu_ = true; - } else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID && - skip_next_cpu_) { - skip_next_cpu_ = false; + prev_instr_addr_ = input_entry_->addr; + if (cur_ref_.instr.type != TRACE_TYPE_INSTR_NO_FETCH) + ++cur_instr_count_; + // Look for encoding bits that belong to this instr. + if (last_encoding_.size > 0) { + if (last_encoding_.size != cur_ref_.instr.size) { + ERRMSG("Encoding size %zu != instr size %zu\n", last_encoding_.size, + cur_ref_.instr.size); + assert(false); + } + memcpy(cur_ref_.instr.encoding, last_encoding_.bits, last_encoding_.size); + cur_ref_.instr.encoding_is_new = true; + encodings_[cur_ref_.instr.addr] = last_encoding_; } else { - have_memref = true; + cur_ref_.instr.encoding_is_new = false; + const auto &it = encodings_.find(cur_ref_.instr.addr); + if (it != encodings_.end()) { + memcpy(cur_ref_.instr.encoding, it->second.bits, it->second.size); + } else if (!expect_no_encodings_) { + ERRMSG("Missing encoding for 0x%zx\n", cur_ref_.instr.addr); + assert(false); + } } - if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP) - last_timestamp_instr_count_ = cur_instr_count_; - else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CHUNK_INSTR_COUNT) - chunk_instr_count_ = cur_ref_.marker.marker_value; - else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_FILETYPE && - TESTANY(OFFLINE_FILE_TYPE_ENCODINGS, cur_ref_.marker.marker_value)) - expect_no_encodings_ = false; - break; - default: - ERRMSG("Unknown trace entry type %d\n", input_entry_->type); - assert(false); - at_eof_ = true; // bail - break; + last_encoding_.size = 0; } - if (have_memref) { - ++cur_ref_count_; - break; + break; + case TRACE_TYPE_INSTR_BUNDLE: + have_memref = true; + // The trace stream always has the instr fetch first, which we 
+ // use to compute the starting PC for the subsequent instructions. + if (!(type_is_instr(cur_ref_.instr.type) || + cur_ref_.instr.type == TRACE_TYPE_INSTR_NO_FETCH)) { + // XXX i#3320: Diagnostics to track down the elusive remaining case of + // this assert on Appveyor. We'll remove and replace with just the + // assert once we have a fix. + ERRMSG("Invalid trace entry type %d before a bundle\n", cur_ref_.instr.type); + assert(type_is_instr(cur_ref_.instr.type) || + cur_ref_.instr.type == TRACE_TYPE_INSTR_NO_FETCH); + } + cur_ref_.instr.size = input_entry_->length[bundle_idx_++]; + cur_pc_ = next_pc_; + cur_ref_.instr.addr = cur_pc_; + next_pc_ = cur_pc_ + cur_ref_.instr.size; + // input_entry_->size stores the number of instrs in this bundle + assert(input_entry_->size <= sizeof(input_entry_->length)); + if (bundle_idx_ == input_entry_->size) + bundle_idx_ = 0; + break; + case TRACE_TYPE_INSTR_FLUSH: + case TRACE_TYPE_DATA_FLUSH: + assert(cur_tid_ != 0 && cur_pid_ != 0); + cur_ref_.flush.pid = cur_pid_; + cur_ref_.flush.tid = cur_tid_; + cur_ref_.flush.type = (trace_type_t)input_entry_->type; + cur_ref_.flush.size = input_entry_->size; + cur_ref_.flush.addr = input_entry_->addr; + if (cur_ref_.flush.size != 0) + have_memref = true; + break; + case TRACE_TYPE_INSTR_FLUSH_END: + case TRACE_TYPE_DATA_FLUSH_END: + cur_ref_.flush.size = input_entry_->addr - cur_ref_.flush.addr; + have_memref = true; + break; + case TRACE_TYPE_THREAD: + cur_tid_ = (memref_tid_t)input_entry_->addr; + // tid2pid might not be filled in yet: if so, we expect a + // TRACE_TYPE_PID entry right after this one, and later asserts + // will complain if it wasn't there. 
+ cur_pid_ = tid2pid_[cur_tid_]; + break; + case TRACE_TYPE_THREAD_EXIT: + cur_tid_ = (memref_tid_t)input_entry_->addr; + cur_pid_ = tid2pid_[cur_tid_]; + assert(cur_tid_ != 0 && cur_pid_ != 0); + // We do pass this to the caller but only some fields are valid: + cur_ref_.exit.pid = cur_pid_; + cur_ref_.exit.tid = cur_tid_; + cur_ref_.exit.type = (trace_type_t)input_entry_->type; + have_memref = true; + break; + case TRACE_TYPE_PID: + cur_pid_ = (memref_pid_t)input_entry_->addr; + // We do want to replace, in case of tid reuse. + tid2pid_[cur_tid_] = cur_pid_; + break; + case TRACE_TYPE_MARKER: + cur_ref_.marker.type = (trace_type_t)input_entry_->type; + if (!online_ && + (input_entry_->size == TRACE_MARKER_TYPE_VERSION || + input_entry_->size == TRACE_MARKER_TYPE_FILETYPE)) { + // Do not carry over a prior thread on a thread switch to a + // first-time-seen new thread, whose tid entry is *after* these + // markers for offline traces. + cur_pid_ = 0; + cur_tid_ = 0; + } else { + assert(cur_tid_ != 0 && cur_pid_ != 0); } + cur_ref_.marker.pid = cur_pid_; + cur_ref_.marker.tid = cur_tid_; + cur_ref_.marker.marker_type = (trace_marker_type_t)input_entry_->size; + cur_ref_.marker.marker_value = input_entry_->addr; + // Look for timestamp+cpu duplicated from the prior chunk. Skip them on + // a linear walk. File readers that support seeking will read them + // and use them to start post-seek iteration. 
+ if (chunk_instr_count_ > 0 && + cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP && + cur_instr_count_ / chunk_instr_count_ != + last_timestamp_instr_count_ / chunk_instr_count_) { + VPRINT(this, 2, "skipping start-of-chunk dup timestamp\n"); + skip_next_cpu_ = true; + } else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID && + skip_next_cpu_) { + VPRINT(this, 2, "skipping start-of-chunk dup cpu\n"); + skip_next_cpu_ = false; + } else { + have_memref = true; + } + if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP) + last_timestamp_instr_count_ = cur_instr_count_; + else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_CHUNK_INSTR_COUNT) + chunk_instr_count_ = cur_ref_.marker.marker_value; + else if (cur_ref_.marker.marker_type == TRACE_MARKER_TYPE_FILETYPE && + TESTANY(OFFLINE_FILE_TYPE_ENCODINGS, cur_ref_.marker.marker_value)) + expect_no_encodings_ = false; + break; + default: + ERRMSG("Unknown trace entry type %d\n", input_entry_->type); + assert(false); + at_eof_ = true; // bail + break; } + if (have_memref) + ++cur_ref_count_; + return have_memref; +} +reader_t & +reader_t::skip_instructions(uint64_t instruction_count) +{ + // This base class has no fast seeking and must do a linear walk. + // We have +1 because we need to skip the memrefs of the final skipped + // instr, so we look for the 1st unskipped instr. + uint64_t stop_count_ = cur_instr_count_ + instruction_count + 1; + while (cur_instr_count_ < stop_count_) { + // TODO i#5538: Remember the last timestamp+cpu and insert it; share + // code with the zipfile reader. + // TODO i#5538: If skipping from the start, Record all of the header values + // until the first timestamp and present them as new memtrace_stream_t + // interfaces. 
+ ++(*this); + } return *this; } diff --git a/clients/drcachesim/reader/reader.h b/clients/drcachesim/reader/reader.h index 490f7203f58..670476d1538 100644 --- a/clients/drcachesim/reader/reader.h +++ b/clients/drcachesim/reader/reader.h @@ -102,6 +102,18 @@ class reader_t : public std::iterator, virtual reader_t & operator++(); + // Skips records until "count" instruction records have been passed. + // This will skip top-level headers for a thread; it is up to the caller + // to first observe those before skipping, if needed. For interleaved-thread + // iteration, top-level headers in other threads will be skipped as well + // (but generally speaking these are identical to the initial thread). + // TODO i#5538: Add access to these header values from #memtrace_stream_t + // and document it here. + // TODO i#5538: Skipping from the middle will not always duplicate the + // last timestamp,cpu. + virtual reader_t & + skip_instructions(uint64_t instruction_count); + // Supplied for subclasses that may fail in their constructors. virtual bool operator!() @@ -137,6 +149,10 @@ class reader_t : public std::iterator, virtual bool read_next_thread_entry(size_t thread_index, OUT trace_entry_t *entry, OUT bool *eof) = 0; + // This updates internal state for the just-read input_entry_. + // Returns whether a new memref record is now available. + virtual bool + process_input_entry(); // Following typical stream iterator convention, the default constructor // produces an EOF object. @@ -147,6 +163,11 @@ class reader_t : public std::iterator, int verbosity_ = 0; bool online_ = true; const char *output_prefix_ = "[reader]"; + uint64_t cur_ref_count_ = 0; + uint64_t cur_instr_count_ = 0; + uint64_t chunk_instr_count_ = 0; // Unchanging once set to non-zero. 
+ uint64_t last_timestamp_instr_count_ = 0; + trace_entry_t *input_entry_ = nullptr; private: struct encoding_info_t { @@ -154,7 +175,6 @@ class reader_t : public std::iterator, unsigned char bits[MAX_ENCODING_LENGTH]; }; - trace_entry_t *input_entry_ = nullptr; memref_t cur_ref_; memref_tid_t cur_tid_ = 0; memref_pid_t cur_pid_ = 0; @@ -163,10 +183,6 @@ class reader_t : public std::iterator, addr_t prev_instr_addr_ = 0; int bundle_idx_ = 0; std::unordered_map tid2pid_; - uint64_t cur_ref_count_ = 0; - uint64_t cur_instr_count_ = 0; - uint64_t chunk_instr_count_ = 0; // Unchanging once set to non-zero. - uint64_t last_timestamp_instr_count_ = 0; bool skip_next_cpu_ = false; bool expect_no_encodings_ = true; encoding_info_t last_encoding_; diff --git a/clients/drcachesim/reader/zipfile_file_reader.cpp b/clients/drcachesim/reader/zipfile_file_reader.cpp index a1764460617..062d560ad42 100644 --- a/clients/drcachesim/reader/zipfile_file_reader.cpp +++ b/clients/drcachesim/reader/zipfile_file_reader.cpp @@ -31,6 +31,7 @@ */ #include "zipfile_file_reader.h" +#include // We use minizip, which is in the contrib/minizip directory in the zlib // sources. The docs are the header files: @@ -130,4 +131,103 @@ file_reader_t::is_complete() return false; } -// TODO i#5538: Implement seeking via unzLocateFile. 
+template <> +bool +file_reader_t::skip_thread_instructions(size_t thread_index, + uint64_t instruction_count, + OUT bool *eof) +{ + if (instruction_count == 0) + return true; + VPRINT(this, 2, "Thread #%zd skipping %" PRIi64 " instrs\n", thread_index, + instruction_count); + trace_entry_t timestamp = {}; + trace_entry_t cpu = {}; + const memref_t &memref = **this; + if (memref.marker.type == TRACE_TYPE_MARKER && + memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP) { + timestamp = entry_copy_; + } + zipfile_reader_t *zipfile = &input_files_[thread_index]; + // We assume our unzGoToNextFile loop is plenty performant and we don't need to + // know the chunk names to use with a single unzLocateFile. + uint64_t stop_count_ = cur_instr_count_ + instruction_count + 1; + VPRINT(this, 2, + "stop=%" PRIi64 " cur=%" PRIi64 " chunk=%" PRIi64 " est=%" PRIi64 "\n", + stop_count_, cur_instr_count_, chunk_instr_count_, + cur_instr_count_ + + (chunk_instr_count_ - (cur_instr_count_ % chunk_instr_count_))); + // First, quickly skip over chunks to reach the chunk containing the target. 
+ while (cur_instr_count_ + + (chunk_instr_count_ - (cur_instr_count_ % chunk_instr_count_)) < + stop_count_) { + if (unzCloseCurrentFile(zipfile->file) != UNZ_OK) + return false; + int res = unzGoToNextFile(zipfile->file); + if (res != UNZ_OK) { + if (res == UNZ_END_OF_LIST_OF_FILE) { + VPRINT(this, 2, "Thread #%zd hit EOF\n", thread_index); + *eof = true; + } + return false; + } + if (unzOpenCurrentFile(zipfile->file) != UNZ_OK) + return false; + cur_instr_count_ += chunk_instr_count_ - (cur_instr_count_ % chunk_instr_count_); + VPRINT(this, 2, "Thread #%zd at %" PRIi64 " instrs at start of new chunk\n", + thread_index, cur_instr_count_); + VPRINT(this, 2, + "zip chunk stop=%" PRIi64 " cur=%" PRIi64 " chunk=%" PRIi64 + " end-of-chunk=%" PRIi64 "\n", + stop_count_, cur_instr_count_, chunk_instr_count_, + cur_instr_count_ + + (chunk_instr_count_ - (cur_instr_count_ % chunk_instr_count_))); + // Clear cached data from the prior chunk. + zipfile->cur_buf = zipfile->max_buf; + } + // We have to linearly walk the last mile. + // We need to present a timestamp+cpu so we reset this field so process_input_entry() + // will not skip the first pair in this new chunk. + last_timestamp_instr_count_ = cur_instr_count_; + while (cur_instr_count_ < stop_count_) { // End condition is never reached. + if (!read_next_thread_entry(thread_index, &entry_copy_, eof)) + return false; + // We need to pass up memrefs for the final skipped instr, but we don't + // want to process_input_entry() on the first unskipped instr so we can + // insert the timestamp+cpu first. + if (cur_instr_count_ + 1 == stop_count_ && + type_is_instr(static_cast(entry_copy_.type))) + break; + // Update core state. 
+        input_entry_ = &entry_copy_;
+        if (!process_input_entry())
+            continue;
+        const memref_t &memref = **this;
+        if (memref.marker.type == TRACE_TYPE_MARKER) {
+            if (memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP)
+                timestamp = entry_copy_;
+            else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID)
+                cpu = entry_copy_;
+        }
+        // TODO i#5538: Have raw2trace insert a record ordinal marker at chunk entry
+        // and use it here to update the memtrace_stream_t.
+    }
+    if (timestamp.type == TRACE_TYPE_MARKER && cpu.type == TRACE_TYPE_MARKER) {
+        // Insert the two markers.
+        // TODO i#5538: These end up with record ordinals that belong to different
+        // records in the unskipped trace: we should instead not print them out
+        // at all, somehow.
+        trace_entry_t instr = entry_copy_;
+        entry_copy_ = timestamp;
+        process_input_entry();
+        queues_[thread_index].push(cpu);
+        queues_[thread_index].push(instr);
+    } else {
+        // We missed the markers somehow; fall back to just process the instr.
+        // TODO i#5538: For skipping from the middle we need to have the
+        // base reader cache the last timestamp,cpu.
+        VPRINT(this, 1, "Skip failed to find both timestamp and cpu\n");
+        process_input_entry();
+    }
+    return true;
+}
diff --git a/clients/drcachesim/reader/zipfile_file_reader.h b/clients/drcachesim/reader/zipfile_file_reader.h
index 1d6d45a275e..3caea97082f 100644
--- a/clients/drcachesim/reader/zipfile_file_reader.h
+++ b/clients/drcachesim/reader/zipfile_file_reader.h
@@ -55,4 +55,13 @@ struct zipfile_reader_t {
 typedef file_reader_t<zipfile_reader_t> zipfile_file_reader_t;

+/* Declare this so the compiler knows not to use the default implementation in the
+ * class declaration.
+ */ +template <> +bool +file_reader_t::skip_thread_instructions(size_t thread_index, + uint64_t instruction_count, + OUT bool *eof); + #endif /* _ZIPFILE_FILE_READER_H_ */ diff --git a/clients/drcachesim/tests/drmemtrace.allasm_x86_64.trace.zip b/clients/drcachesim/tests/drmemtrace.allasm_x86_64.trace.zip new file mode 100644 index 00000000000..4baed857fbf Binary files /dev/null and b/clients/drcachesim/tests/drmemtrace.allasm_x86_64.trace.zip differ diff --git a/clients/drcachesim/tests/skip_unit_tests.cpp b/clients/drcachesim/tests/skip_unit_tests.cpp new file mode 100644 index 00000000000..046005b18fd --- /dev/null +++ b/clients/drcachesim/tests/skip_unit_tests.cpp @@ -0,0 +1,169 @@ +/* ********************************************************** + * Copyright (c) 2022 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. 
OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* Unit tests for the skip feature. */ + +#include "droption.h" +#include "zipfile_file_reader.h" +#include "tools/view_create.h" + +#include +#include + +#ifndef HAS_ZIP +# error zipfile reading is required for this test +#endif + +#define FATAL_ERROR(msg, ...) \ + do { \ + fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); \ + fflush(stderr); \ + exit(1); \ + } while (0) + +#define CHECK(cond, msg, ...) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "%s\n", msg); \ + return false; \ + } \ + } while (0) + +static droption_t op_trace_file(DROPTION_SCOPE_FRONTEND, "trace_file", "", + "[Required] Trace input .zip file", + "Specifies the input .zip trace file."); + +static droption_t op_verbose(DROPTION_SCOPE_FRONTEND, "verbose", false, + "Whether to print diagnostics", + "Whether to print diagnostics"); + +bool +test_skip_initial() +{ + int view_count = 10; + // Our checked-in trace has a chunk size of 20, letting us test cross-chunk + // skips. We verify the chunk size below to ensure updates to that file + // remember to set that value. + // We check each skip value to ensure the view tool is outputting the + // expected instruction count. + for (int skip_instrs = 0; skip_instrs < 50; skip_instrs++) { + if (op_verbose.get_value()) + std::cout << "Testing -skip_instrs " << skip_instrs << "\n"; + // Capture cerr. + std::stringstream capture; + std::streambuf *prior = std::cerr.rdbuf(capture.rdbuf()); + // Open the trace. 
+ std::unique_ptr iter = std::unique_ptr( + new zipfile_file_reader_t(op_trace_file.get_value())); + CHECK(!!iter, "failed to open zipfile"); + CHECK(iter->init(), "failed to initialize reader"); + std::unique_ptr iter_end = + std::unique_ptr(new zipfile_file_reader_t()); + // Run the tool. + std::unique_ptr tool = + std::unique_ptr(view_tool_create( + "", /*thread=*/0, /*skip_refs=*/0, /*sim_refs=*/view_count, "att")); + std::string error = tool->initialize_stream(iter.get()); + CHECK(error.empty(), error.c_str()); + iter->skip_instructions(skip_instrs); + for (; *iter != *iter_end; ++(*iter)) { + const memref_t &memref = **iter; + CHECK(tool->process_memref(memref), tool->get_error_string().c_str()); + } + // Check the result. + std::string res = capture.str(); + if (op_verbose.get_value()) + std::cout << "Got: |" << res << "|\n"; + CHECK(skip_instrs != 0 || + res.find("chunk instruction count 20") != std::string::npos, + "expecting chunk size of 20 in test trace"); + std::stringstream res_stream(res); + // Example output for -skip_instrs 49: + // Output format: + // : T + // ------------------------------------------------------------ + // 18 49: T3854659 + // 19 49: T3854659 + // 20 50: T3854659 ifetch 2 byte(s) @ 0x0000000000401030 75 + // d9 jnz $0x000000000040100b + std::string line; + // First we expect "Output format:" + std::getline(res_stream, line, '\n'); + CHECK(starts_with(line, "Output format"), "missing header"); + // Next we expect " : T " + std::getline(res_stream, line, '\n'); + CHECK(starts_with(line, " "), "missing 2nd header"); + // Next we expect "------------------------------------------------------------" + std::getline(res_stream, line, '\n'); + CHECK(starts_with(line, "------"), "missing divider line"); + // Next we expect the timestamp entry with the instruction count before + // a colon: " 18 49: T3854659 " + // We expect the count to equal the -skip_instrs value. 
+ std::getline(res_stream, line, '\n'); + std::stringstream expect_stream; + expect_stream << skip_instrs << ":"; + CHECK(line.find(expect_stream.str()) != std::string::npos, "bad instr ordinal"); + // TODO i#5538: Once we fix the record count, check it too to ensure it + // is > the instr count or other sanity checks. + CHECK(skip_instrs == 0 || line.find("timestamp") != std::string::npos, + "missing timestamp"); + // Next we expect the cpuid entry. + std::getline(res_stream, line, '\n'); + CHECK(skip_instrs == 0 || line.find("on core") != std::string::npos, + "missing cpuid"); + // Next we expect the target instruction fetch. + std::getline(res_stream, line, '\n'); + CHECK(skip_instrs == 0 || line.find("ifetch") != std::string::npos, + "missing ifetch"); + // Reset cerr. + std::cerr.rdbuf(prior); + } + return true; +} + +int +main(int argc, const char *argv[]) +{ + std::string parse_err; + if (!droption_parser_t::parse_argv(DROPTION_SCOPE_FRONTEND, argc, (const char **)argv, + &parse_err, NULL) || + op_trace_file.get_value().empty()) { + FATAL_ERROR("Usage error: %s\nUsage:\n%s", parse_err.c_str(), + droption_parser_t::usage_short(DROPTION_SCOPE_ALL).c_str()); + } + if (!test_skip_initial()) + return 1; + // TODO i#5538: Add tests that skip from the middle once we have full support + // for duplicating the timestamp,cpu in that scenario. + fprintf(stderr, "Success\n"); + return 0; +} diff --git a/clients/drcachesim/tools/view.h b/clients/drcachesim/tools/view.h index 24819e81fe6..cb6fea8ee95 100644 --- a/clients/drcachesim/tools/view.h +++ b/clients/drcachesim/tools/view.h @@ -96,9 +96,15 @@ class view_t : public analysis_tool_t { print_prefix(memtrace_stream_t *memstream, const memref_t &memref, int ref_adjust = 0, std::ostream &stream = std::cerr) { - if (prev_tid_ != -1 && prev_tid_ != memref.instr.tid) + if ((prev_tid_ != -1 && prev_tid_ != memref.instr.tid) || + // Print a divider for a skip_instructions gap too. 
+ (prev_record_ != 0 && + prev_record_ + 1 < memstream->get_record_ordinal() + ref_adjust)) stream << "------------------------------------------------------------\n"; prev_tid_ = memref.instr.tid; + // TODO i#5538: After skipping across a chunk, the record ordinal is + // incorrect. We need raw2trace to insert a record ordinal at chunk entry. + prev_record_ = memstream->get_record_ordinal() + ref_adjust; stream << std::setw(9) << (memstream->get_record_ordinal() + ref_adjust) << std::setw(9) << memstream->get_instruction_ordinal() << ": T" << memref.marker.tid << " "; @@ -130,6 +136,7 @@ class view_t : public analysis_tool_t { uint64_t num_disasm_instrs_; std::unordered_map disasm_cache_; memref_tid_t prev_tid_; + uint64_t prev_record_ = 0; intptr_t filetype_; std::unordered_set printed_header_; std::unordered_map last_window_; diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp index e366ed13ced..daae4b8f1cc 100644 --- a/clients/drcachesim/tracer/raw2trace.cpp +++ b/clients/drcachesim/tracer/raw2trace.cpp @@ -1225,6 +1225,7 @@ raw2trace_t::open_new_chunk(raw2trace_thread_data_t *tdata) // We need to clear the encoding cache so that each chunk is self-contained // and repeats all encodings used inside it. tdata->encoding_emitted.clear(); + tdata->last_encoding_emitted = nullptr; // TODO i#5538: Add a virtual-to-physical cache and clear it here. // We'll need to add a routine for trace_converter_t to call to query our cache --