Skip to content

Commit

Permalink
fix: Fix VTT cue timing in HLS (shaka-project#4217)
Browse files Browse the repository at this point in the history
Since the transition to sequence mode for HLS in v4.0.0, VTT cue
timings have been broken.  This is mainly because VTT cue timing in HLS is
meant to be based on an offset from the media timestamps, and we
generally don't know those timestamps now that we use sequence mode.

To fix it, this change uses MediaSource "segments" mode for the very
first video segment as a way to extract the timestamp, then clears the
buffer, switches to "sequence" mode, and appends the same segment again.
This lets us get the timing data we need, while avoiding major drawbacks
of the previous HLS implementation:
 - We don't need to fetch segments upfront (which is high latency)
 - We don't need to fetch segments twice (once for timestamps, and
   once again to buffer)
 - We don't need to maintain parsers (which were complex and limited
   the formats we could support)

Closes shaka-project#4191
  • Loading branch information
joeyparrish committed May 17, 2022
1 parent 24eac2c commit 69d1c14
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 51 deletions.
128 changes: 91 additions & 37 deletions lib/media/media_source_engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ shaka.media.MediaSourceEngine = class {

/** @private {string} */
this.url_ = '';

/** @private {boolean} */
this.sequenceMode_ = false;

/** @private {!shaka.util.PublicPromise.<number>} */
this.textSequenceModeOffset_ = new shaka.util.PublicPromise();
}

/**
Expand Down Expand Up @@ -331,6 +337,8 @@ shaka.media.MediaSourceEngine = class {

await this.mediaSourceOpen_;

this.sequenceMode_ = sequenceMode;

for (const contentType of streamsByType.keys()) {
const stream = streamsByType.get(contentType);
goog.asserts.assert(
Expand All @@ -348,11 +356,9 @@ shaka.media.MediaSourceEngine = class {
mimeType =
shaka.media.Transmuxer.convertTsCodecs(contentType, mimeType);
}

const sourceBuffer = this.mediaSource_.addSourceBuffer(mimeType);
if (sequenceMode) {
sourceBuffer.mode =
shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;
}

this.eventManager_.listen(
sourceBuffer, 'error',
() => this.onError_(contentType));
Expand Down Expand Up @@ -515,35 +521,29 @@ shaka.media.MediaSourceEngine = class {
* @param {?boolean} hasClosedCaptions True if the buffer contains CEA closed
* captions
* @param {boolean=} seeked True if we just seeked
* @param {boolean=} sequenceMode True if sequence mode
* @return {!Promise}
*/
async appendBuffer(contentType, data, startTime, endTime, hasClosedCaptions,
seeked, sequenceMode) {
async appendBuffer(
contentType, data, startTime, endTime, hasClosedCaptions, seeked) {
const ContentType = shaka.util.ManifestParserUtils.ContentType;

if (startTime != null && sequenceMode && contentType != ContentType.TEXT) {
// If we just cleared the buffer and are on an unbuffered seek, we need to set
// the new timestampOffset of the sourceBuffer.
// Don't do this for text streams, though, since they don't use
// MediaSource anyway.
if (seeked) {
const timestampOffset = /** @type {number} */ (startTime);
this.enqueueOperation_(
contentType,
() => this.setTimestampOffset_(contentType, timestampOffset));
if (contentType == ContentType.TEXT) {
if (this.sequenceMode_) {
// This won't be known until the first video segment is appended.
const offset = await this.textSequenceModeOffset_;
this.textEngine_.setTimestampOffset(offset);
}
await this.textEngine_.appendBuffer(data, startTime, endTime);
return;
}

if (contentType == ContentType.TEXT) {
await this.textEngine_.appendBuffer(data, startTime, endTime);
} else if (this.transmuxers_[contentType]) {
if (this.transmuxers_[contentType]) {
const transmuxedData =
await this.transmuxers_[contentType].transmux(data);
// For HLS CEA-608/708 CLOSED-CAPTIONS, text data is embedded in
// the video stream, so textEngine may not have been initialized.
if (!this.textEngine_) {
this.reinitText('text/vtt', sequenceMode || false);
this.reinitText('text/vtt', this.sequenceMode_);
}

if (transmuxedData.metadata) {
Expand All @@ -562,15 +562,10 @@ shaka.media.MediaSourceEngine = class {
closedCaptions, startTime, endTime, videoOffset);
}

let transmuxedSegment = transmuxedData.data;
transmuxedSegment = this.workAroundBrokenPlatforms_(
transmuxedSegment, startTime, contentType);

await this.enqueueOperation_(
contentType, () => this.append_(contentType, transmuxedSegment));
data = transmuxedData.data;
} else if (hasClosedCaptions) {
if (!this.textEngine_) {
this.reinitText('text/vtt', sequenceMode || false);
this.reinitText('text/vtt', this.sequenceMode_);
}
// If it is the init segment for closed captions, initialize the closed
// caption parser.
Expand All @@ -585,19 +580,78 @@ shaka.media.MediaSourceEngine = class {
closedCaptions, startTime, endTime, videoOffset);
}
}
}

data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
data = this.workAroundBrokenPlatforms_(data, startTime, contentType);

const sourceBuffer = this.sourceBuffers_[contentType];
const SEQUENCE = shaka.media.MediaSourceEngine.SourceBufferMode_.SEQUENCE;

if (this.sequenceMode_ && sourceBuffer.mode != SEQUENCE &&
startTime != null) {
// This is the first media segment to be appended to a SourceBuffer in
// sequence mode. We set the mode late so that we can trick MediaSource
// into extracting a timestamp for us to align text segments in sequence
// mode.

// Timestamps can only be reliably extracted from video, not audio.
// Packed audio formats do not have internal timestamps at all.
// Prefer video for this when available.
const isBestSourceBufferForTimestamps =
contentType == ContentType.VIDEO ||
!(ContentType.VIDEO in this.sourceBuffers_);
if (isBestSourceBufferForTimestamps) {
// Append the segment in segments mode first, with offset of 0 and an
// open append window.
const originalRange =
[sourceBuffer.appendWindowStart, sourceBuffer.appendWindowEnd];
sourceBuffer.appendWindowStart = 0;
sourceBuffer.appendWindowEnd = Infinity;

const originalOffset = sourceBuffer.timestampOffset;
sourceBuffer.timestampOffset = 0;

await this.enqueueOperation_(
contentType, () => this.append_(contentType, data));

// Reset the offset and append window.
sourceBuffer.timestampOffset = originalOffset;
sourceBuffer.appendWindowStart = originalRange[0];
sourceBuffer.appendWindowEnd = originalRange[1];

// Now get the timestamp of the segment and compute the offset for text
// segments.
const mediaStartTime = shaka.media.TimeRangesUtils.bufferStart(
this.getBuffered_(contentType));
const textOffset = (startTime || 0) - (mediaStartTime || 0);
this.textSequenceModeOffset_.resolve(textOffset);

// Finally, clear the buffer.
await this.enqueueOperation_(
contentType,
() => this.remove_(contentType, 0, this.mediaSource_.duration));
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
} else {
data = this.workAroundBrokenPlatforms_(data, startTime, contentType);
// Now switch to sequence mode and fall through to our normal operations.
sourceBuffer.mode = SEQUENCE;
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
if (startTime != null && this.sequenceMode_ &&
contentType != ContentType.TEXT) {
// In sequence mode, for non-text streams, if we just cleared the buffer
// and are performing an unbuffered seek, we need to set a new
// timestampOffset on the sourceBuffer.
if (seeked) {
const timestampOffset = /** @type {number} */ (startTime);
this.enqueueOperation_(
contentType,
() => this.setTimestampOffset_(contentType, timestampOffset));
}
}

await this.enqueueOperation_(
contentType,
() => this.append_(contentType, data));
}

/**
Expand Down
3 changes: 1 addition & 2 deletions lib/media/streaming_engine.js
Original file line number Diff line number Diff line change
Expand Up @@ -1605,8 +1605,7 @@ shaka.media.StreamingEngine = class {
reference.syncTime == null ? reference.startTime : reference.syncTime,
reference.endTime,
hasClosedCaptions,
seeked,
this.manifest_.sequenceMode);
seeked);
this.destroyer_.ensureNotDestroyed();
shaka.log.v2(logPrefix, 'appended media segment');
}
Expand Down
14 changes: 7 additions & 7 deletions lib/text/vtt_text_parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,15 @@ shaka.text.VttTextParser = class {
// to the beginning of each segment.
// NOTE: "periodStart" is the timestamp offset applied via TextEngine.
// It is no longer closely tied to periods, but the name stuck around.
// NOTE: This offset and the flag choosing its meaning have no effect on
// HLS content, which should use X-TIMESTAMP-MAP and periodStart instead.
let offset = time.vttOffset;

// Do not honor the 'X-TIMESTAMP-MAP' value when in sequence mode.
// That is because it is used mainly (solely?) to account for the timestamp
// offset of the video/audio; when in sequence mode, we normalize that
// timestamp offset to 0, so we should not account for it.
if (blocks[0].includes('X-TIMESTAMP-MAP') && !this.sequenceMode_) {
// Only use 'X-TIMESTAMP-MAP' in sequence mode, as that is currently
// shorthand for HLS. Note that an offset based on the first video
// timestamp has already been extracted, and appears in periodStart.
// The relative offset from X-TIMESTAMP-MAP will be added to that for HLS.
if (blocks[0].includes('X-TIMESTAMP-MAP') && this.sequenceMode_) {
// https://bit.ly/2K92l7y
// The 'X-TIMESTAMP-MAP' header is used in HLS to align text with
// the rest of the media.
Expand Down Expand Up @@ -109,8 +111,6 @@ shaka.text.VttTextParser = class {
mpegTime += shaka.text.VttTextParser.TS_ROLLOVER_;
}

// Apple-encoded HLS content uses absolute timestamps, so assume the
// presence of the map tag means the content uses absolute timestamps.
offset = time.periodStart + mpegTime / mpegTimescale - cueTime;
}
}
Expand Down
69 changes: 64 additions & 5 deletions test/text/vtt_text_parser_unit.js
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,61 @@ describe('VttTextParser', () => {
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0});
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('ignores X-TIMESTAMP-MAP header if not in sequence mode', () => {
verifyHelper(
[
{startTime: 20, endTime: 40, payload: 'Test'},
{startTime: 40, endTime: 50, payload: 'Test2'},
],
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:01:00:00.000\n\n' +
'00:00:20.000 --> 00:00:40.000 line:0\n' +
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ false);
});

it('parses X-TIMESTAMP-MAP header with non-zero local base', () => {
verifyHelper(
[
{startTime: 1800, endTime: 1810, payload: 'Test'},
{startTime: 1820, endTime: 1830, payload: 'Test2'},
],
// 162000000 = 30 * 60 * 90k = 30 minutes for the TS part of the map.
// The local (VTT) part of the map is 1 hour.
// So text times of 1 hour map to media times of 30 minutes.
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:162000000,LOCAL:01:00:00.000\n\n' +
'01:00:00.000 --> 01:00:10.000 line:0\n' +
'Test\n\n' +
'01:00:20.000 --> 01:00:30.000 line:-1\n' +
'Test2',
{periodStart: 0, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('combines X-TIMESTAMP-MAP header with periodStart', () => {
verifyHelper(
[
{startTime: 130, endTime: 150, payload: 'Test'},
{startTime: 150, endTime: 160, payload: 'Test2'},
],
// 900000 = 10 sec, so expect every timestamp to be 10
// seconds ahead of what is specified.
'WEBVTT\n' +
'X-TIMESTAMP-MAP=MPEGTS:900000,LOCAL:00:00:00.000\n\n' +
'00:00:20.000 --> 00:00:40.000 line:0\n' +
'Test\n\n' +
'00:00:40.000 --> 00:00:50.000 line:-1\n' +
'Test2',
{periodStart: 100, segmentStart: 25, segmentEnd: 65, vttOffset: 0},
/* sequenceMode= */ true);
});

it('handles timestamp rollover with X-TIMESTAMP-MAP header', () => {
Expand All @@ -551,7 +605,8 @@ describe('VttTextParser', () => {
'Test',
// Non-null segmentStart takes precedence over X-TIMESTAMP-MAP.
// This protects us from rollover in the MPEGTS field.
{periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0});
{periodStart: 0, segmentStart: 95440, segmentEnd: 95550, vttOffset: 0},
/* sequenceMode= */ true);

verifyHelper(
[
Expand All @@ -564,7 +619,8 @@ describe('VttTextParser', () => {
'X-TIMESTAMP-MAP=MPEGTS:9745408,LOCAL:00:00:00.000\n\n' +
'00:00:00.000 --> 00:00:02.000 line:0\n' +
'Test2',
{periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0});
{periodStart: 0, segmentStart: 95550, segmentEnd: 95560, vttOffset: 0},
/* sequenceMode= */ true);
});

it('supports global style blocks', () => {
Expand Down Expand Up @@ -978,11 +1034,14 @@ describe('VttTextParser', () => {
* @param {!Array} cues
* @param {string} text
* @param {shaka.extern.TextParser.TimeContext} time
* @param {boolean=} sequenceMode
*/
function verifyHelper(cues, text, time) {
function verifyHelper(cues, text, time, sequenceMode = false) {
const data =
shaka.util.BufferUtils.toUint8(shaka.util.StringUtils.toUTF8(text));
const result = new shaka.text.VttTextParser().parseMedia(data, time);
const parser = new shaka.text.VttTextParser();
parser.setSequenceMode(sequenceMode);
const result = parser.parseMedia(data, time);

const expected = cues.map((cue) => {
if (cue.nestedCues) {
Expand Down

0 comments on commit 69d1c14

Please sign in to comment.