diff --git a/README.md b/README.md index 34c906cb..471b8567 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,12 @@ Set to `true` to remux audio and video into a single MP4 segment. This module reads CEA-608 captions out of FMP4 segments. +#### WebVttParser + +`muxjs.mp4.WebVttParser` + +This module reads WebVTT text out of FMP4 segments. + #### Tools `muxjs.mp4.tools` diff --git a/lib/mp4/caption-parser.js b/lib/mp4/caption-parser.js index 8c4e3957..7a8e9261 100644 --- a/lib/mp4/caption-parser.js +++ b/lib/mp4/caption-parser.js @@ -13,9 +13,8 @@ var discardEmulationPreventionBytes = require('../tools/caption-packet-parser'). var CaptionStream = require('../m2ts/caption-stream').CaptionStream; var findBox = require('../mp4/find-box.js'); var parseTfdt = require('../tools/parse-tfdt.js'); -var parseTrun = require('../tools/parse-trun.js'); var parseTfhd = require('../tools/parse-tfhd.js'); -var window = require('global/window'); +var { getMdatTrafPairs, parseSamples } = require('./samples.js'); /** * Maps an offset in the mdat to a sample based on the the size of the samples. @@ -118,62 +117,6 @@ var findSeiNals = function(avcStream, samples, trackId) { return result; }; -/** - * Parses sample information out of Track Run Boxes and calculates - * the absolute presentation and decode timestamps of each sample. 
- * - * @param {Array} truns - The Trun Run boxes to be parsed - * @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt - @see ISO-BMFF-12/2015, Section 8.8.12 - * @param {Object} tfhd - The parsed Track Fragment Header - * @see inspect.parseTfhd - * @return {Object[]} the parsed samples - * - * @see ISO-BMFF-12/2015, Section 8.8.8 - **/ -var parseSamples = function(truns, baseMediaDecodeTime, tfhd) { - var currentDts = baseMediaDecodeTime; - var defaultSampleDuration = tfhd.defaultSampleDuration || 0; - var defaultSampleSize = tfhd.defaultSampleSize || 0; - var trackId = tfhd.trackId; - var allSamples = []; - - truns.forEach(function(trun) { - // Note: We currently do not parse the sample table as well - // as the trun. It's possible some sources will require this. - // moov > trak > mdia > minf > stbl - var trackRun = parseTrun(trun); - var samples = trackRun.samples; - - samples.forEach(function(sample) { - if (sample.duration === undefined) { - sample.duration = defaultSampleDuration; - } - if (sample.size === undefined) { - sample.size = defaultSampleSize; - } - sample.trackId = trackId; - sample.dts = currentDts; - if (sample.compositionTimeOffset === undefined) { - sample.compositionTimeOffset = 0; - } - - if (typeof currentDts === 'bigint') { - sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset); - currentDts += window.BigInt(sample.duration); - - } else { - sample.pts = currentDts + sample.compositionTimeOffset; - currentDts += sample.duration; - } - }); - - allSamples = allSamples.concat(samples); - }); - - return allSamples; -}; - /** * Parses out caption nals from an FMP4 segment's video tracks. 
* @@ -183,21 +126,8 @@ var parseSamples = function(truns, baseMediaDecodeTime, tfhd) { * a list of seiNals found in that track **/ var parseCaptionNals = function(segment, videoTrackId) { - // To get the samples - var trafs = findBox(segment, ['moof', 'traf']); - // To get SEI NAL units - var mdats = findBox(segment, ['mdat']); var captionNals = {}; - var mdatTrafPairs = []; - - // Pair up each traf with a mdat as moofs and mdats are in pairs - mdats.forEach(function(mdat, index) { - var matchingTraf = trafs[index]; - mdatTrafPairs.push({ - mdat: mdat, - traf: matchingTraf - }); - }); + var mdatTrafPairs = getMdatTrafPairs(segment); mdatTrafPairs.forEach(function(pair) { var mdat = pair.mdat; diff --git a/lib/mp4/index.js b/lib/mp4/index.js index 15b9123e..0b782c82 100644 --- a/lib/mp4/index.js +++ b/lib/mp4/index.js @@ -10,5 +10,6 @@ module.exports = { Transmuxer: require('./transmuxer').Transmuxer, AudioSegmentStream: require('./transmuxer').AudioSegmentStream, VideoSegmentStream: require('./transmuxer').VideoSegmentStream, - CaptionParser: require('./caption-parser') + CaptionParser: require('./caption-parser'), + WebVttParser: require('./webvtt-parser') }; diff --git a/lib/mp4/samples.js b/lib/mp4/samples.js new file mode 100644 index 00000000..318fcdd1 --- /dev/null +++ b/lib/mp4/samples.js @@ -0,0 +1,87 @@ +const { parseTrun } = require("../tools/mp4-inspector"); +const { findBox } = require("./probe"); +var window = require('global/window'); + +/** + * Utility function for parsing data from mdat boxes. + * @param {Array} segment the segment data to create mdat/traf pairs from. + * @returns mdat and traf boxes paired up for easier parsing. 
+ */ +var getMdatTrafPairs = function(segment) { + var trafs = findBox(segment, ['moof', 'traf']); + var mdats = findBox(segment, ['mdat']); + + var mdatTrafPairs = []; + + // Pair up each traf with a mdat as moofs and mdats are in pairs + mdats.forEach(function(mdat, index) { + var matchingTraf = trafs[index]; + mdatTrafPairs.push({ + mdat: mdat, + traf: matchingTraf + }); + }); + + return mdatTrafPairs; +}; + +/** + * Parses sample information out of Track Run Boxes and calculates + * the absolute presentation and decode timestamps of each sample. + * + * @param {Array} truns - The Trun Run boxes to be parsed + * @param {Number|BigInt} baseMediaDecodeTime - base media decode time from tfdt + @see ISO-BMFF-12/2015, Section 8.8.12 + * @param {Object} tfhd - The parsed Track Fragment Header + * @see inspect.parseTfhd + * @return {Object[]} the parsed samples + * + * @see ISO-BMFF-12/2015, Section 8.8.8 + **/ +var parseSamples = function(truns, baseMediaDecodeTime, tfhd) { + var currentDts = baseMediaDecodeTime; + var defaultSampleDuration = tfhd.defaultSampleDuration || 0; + var defaultSampleSize = tfhd.defaultSampleSize || 0; + var trackId = tfhd.trackId; + var allSamples = []; + + truns.forEach(function(trun) { + // Note: We currently do not parse the sample table as well + // as the trun. It's possible some sources will require this. 
+ // moov > trak > mdia > minf > stbl + var trackRun = parseTrun(trun); + var samples = trackRun.samples; + + samples.forEach(function(sample) { + if (sample.duration === undefined) { + sample.duration = defaultSampleDuration; + } + if (sample.size === undefined) { + sample.size = defaultSampleSize; + } + sample.trackId = trackId; + sample.dts = currentDts; + if (sample.compositionTimeOffset === undefined) { + sample.compositionTimeOffset = 0; + } + + if (typeof currentDts === 'bigint') { + sample.pts = currentDts + window.BigInt(sample.compositionTimeOffset); + currentDts += window.BigInt(sample.duration); + + } else { + sample.pts = currentDts + sample.compositionTimeOffset; + currentDts += sample.duration; + } + }); + + allSamples = allSamples.concat(samples); + }); + + return allSamples; +}; + +module.exports = { + getMdatTrafPairs, + parseSamples +}; diff --git a/lib/mp4/webvtt-parser.js b/lib/mp4/webvtt-parser.js new file mode 100644 index 00000000..3dbb7ad6 --- /dev/null +++ b/lib/mp4/webvtt-parser.js @@ -0,0 +1,126 @@ +const { parseTfdt, parseTfhd } = require("../tools/mp4-inspector"); +const findBox = require("./find-box"); +const { getTimescaleFromMediaHeader } = require("./probe"); +const { parseSamples, getMdatTrafPairs } = require("./samples"); + +/** + * Module for parsing WebVTT text and styles from FMP4 segments. + * Based on the ISO/IEC 14496-30. + */ +const WebVttParser = function() { + // default timescale to 90k + let timescale = 90e3; + + /** + * Parses the timescale from the init segment. + * @param {Array} segment The initialization segment to parse the timescale from. + */ + this.init = function(segment) { + // We just need the timescale from the init segment. + const mdhd = findBox(segment, ['moov', 'trak', 'mdia', 'mdhd'])[0]; + + if (mdhd) { + timescale = getTimescaleFromMediaHeader(mdhd); + } + }; + + /** + * Parses a WebVTT FMP4 segment. + * @param {Array} segment The content segment to parse the WebVTT cues from. 
+ * @returns The WebVTT cue text, styling, and timing info (start/end are Numbers: pts divided by the timescale) as an array of cue objects. + */ + this.parseSegment = function(segment) { + const vttCues = []; + const mdatTrafPairs = getMdatTrafPairs(segment); + let baseMediaDecodeTime = 0; + + mdatTrafPairs.forEach(function(pair) { + const mdatBox = pair.mdat; + const trafBox = pair.traf; + // zero or one. + const tfdtBox = findBox(trafBox, ['tfdt'])[0]; + // zero or one; parsed before use so parseSamples can read its default sample size/duration and trackId. + const tfhdBox = findBox(trafBox, ['tfhd'])[0]; + // zero or more. + const trunBoxes = findBox(trafBox, ['trun']); + + if (tfdtBox) { + const tfdt = parseTfdt(tfdtBox); + + baseMediaDecodeTime = tfdt.baseMediaDecodeTime; + } + + if (trunBoxes.length && tfhdBox) { + const samples = parseSamples(trunBoxes, baseMediaDecodeTime, parseTfhd(tfhdBox)); + let mdatOffset = 0; + + samples.forEach(function(sample) { + // decode utf8 payload + const UTF_8 = 'utf-8'; + const textDecoder = new TextDecoder(UTF_8); + // extract sample data from the mdat box. + // WebVTT Sample format: + // Exactly one VTTEmptyCueBox box + // OR one or more VTTCueBox boxes. + const sampleData = mdatBox.slice(mdatOffset, mdatOffset + sample.size); + // single vtte box. + const vtteBox = findBox(sampleData, ['vtte'])[0]; + + // empty box + if (vtteBox) { + mdatOffset += sample.size; + return; + } + + // TODO: Support 'vtta' boxes. + // VTTAdditionalTextBoxes can be interleaved between VTTCueBoxes. + + const vttcBoxes = findBox(sampleData, ['vttc']); + + vttcBoxes.forEach(function(vttcBox) { + // mandatory payload box. + const paylBox = findBox(vttcBox, ['payl'])[0]; + // optional settings box + const sttgBox = findBox(vttcBox, ['sttg'])[0]; + const start = Number(sample.pts) / timescale; + const end = (Number(sample.pts) + sample.duration) / timescale; + let cueText, settings; + + // contains cue text. + if (paylBox) { + try { + cueText = textDecoder.decode(paylBox); + } catch(e) { + console.error(e); + } + } + + // settings box contains styling. 
+ if (sttgBox) { + try { + settings = textDecoder.decode(sttgBox); + } catch(e) { + console.error(e); + } + } + + if (sample.duration && cueText) { + vttCues.push({ + cueText, + start, + end, + settings + }); + } + }); + + mdatOffset += sample.size; + }); + } + }); + + return vttCues; + }; +}; + +module.exports = WebVttParser; diff --git a/test/segments/test-webvtt-init.mp4 b/test/segments/test-webvtt-init.mp4 new file mode 100644 index 00000000..ae6716f5 Binary files /dev/null and b/test/segments/test-webvtt-init.mp4 differ diff --git a/test/segments/test-webvtt.m4s b/test/segments/test-webvtt.m4s new file mode 100644 index 00000000..47c73946 Binary files /dev/null and b/test/segments/test-webvtt.m4s differ diff --git a/test/webvtt-parser.test.js b/test/webvtt-parser.test.js new file mode 100644 index 00000000..4cbc3ca5 --- /dev/null +++ b/test/webvtt-parser.test.js @@ -0,0 +1,39 @@ +var segments = require('data-files!segments'); +var vttContentSegment = segments['test-webvtt.m4s'](); +var vttInitSegment = segments['test-webvtt-init.mp4'](); +var WebVttParser = require('../lib/mp4').WebVttParser; +var window = require('global/window'); +var webVttParser; + +QUnit.module('MP4 WebVtt Segment Parser', { + beforeEach: function() { + webVttParser = new WebVttParser(); + } +}); + +QUnit.test('parse webvtt init and content segments', function(assert) { + // Init segment sets the timescale. + webVttParser.init(vttInitSegment); + assert.ok(webVttParser, 'WebVtt parser created'); + // we need a TextDecoder to test the WebVTT segment parser. 
+ if (window.TextDecoder) { + const parsedWebVttCues = webVttParser.parseSegment(vttContentSegment); + const expectedCueValues = [ + { + cueText: "2024-09-19T20:13:06Z\nen # 863388393", + start: 1726776786, + end: 1726776786.9, + settings: undefined + }, + { + cueText: "2024-09-19T20:13:07Z\nen # 863388393", + start: 1726776787, + end: 1726776787.9, + settings: undefined + } + ]; + assert.ok(parsedWebVttCues, 'parsed WebVtt Cues are created'); + assert.equal(parsedWebVttCues.length, 2, '2 WebVtt Cues are created'); + assert.deepEqual(parsedWebVttCues, expectedCueValues, 'WebVtt cues are expected values'); + } +});