diff --git a/lib/text/web_vtt_generator.js b/lib/text/web_vtt_generator.js
index aeb222483c..cfa9fbc554 100644
--- a/lib/text/web_vtt_generator.js
+++ b/lib/text/web_vtt_generator.js
@@ -81,16 +81,28 @@ shaka.text.WebVttGenerator = class {
     // We don't want to modify the array or objects passed in, since we don't
     // technically own them.  So we build a new array and replace certain items
     // in it if they need to be flattened.
-    const flattenedCues = cues.map((cue) => {
-      if (cue.nestedCues.length) {
-        const flatCue = cue.clone();
-        flatCue.nestedCues = [];
-        flatCue.payload = flattenPayload(cue);
-        return flatCue;
-      } else {
-        return cue;
+    // We also don't want to flatten the text payloads starting at a container
+    // element; otherwise, for containers encapsulating multiple caption lines,
+    // the lines would merge into a single cue. This is undesirable when a
+    // subset of the captions is outside of the append time window. To fix
+    // this, we only call flattenPayload() starting at elements marked as
+    // isContainer = false.
+    const getCuesToFlatten = (cues, result) => {
+      for (const cue of cues) {
+        if (cue.isContainer) {
+          // Skip the container itself; collect its non-container descendants.
+          getCuesToFlatten(cue.nestedCues, result);
+        } else {
+          // Flatten into a clone rather than assigning to the passed-in cue.
+          const flatCue = cue.clone();
+          flatCue.nestedCues = [];
+          flatCue.payload = flattenPayload(cue);
+          result.push(flatCue);
+        }
       }
-    });
+      return result;
+    };
+    const flattenedCues = getCuesToFlatten(cues, []);
 
     let webvttString = 'WEBVTT\n\n';
     for (const cue of flattenedCues) {