From bc9266a900a7d2651a72fe2ebaebf598c82ebb49 Mon Sep 17 00:00:00 2001 From: Joshua Pohl Date: Wed, 29 May 2024 14:58:43 -0600 Subject: [PATCH] feat: support fetching avaiable episode transcripts --- README.md | 66 +++++++++++++++++---------------- bin/bin.js | 4 ++ bin/commander.js | 35 +++++++++++++++++- bin/util.js | 95 +++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 157 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 38f2765..8b16ad8 100644 --- a/README.md +++ b/README.md @@ -24,38 +24,40 @@ Either `--url` or `--file` must be provided. Type values surrounded in square brackets (`[]`) can be used as used as boolean options (no argument required). -| Option | Type | Required | Description | -| ------------------------ | ------------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| --url | String | true\* | URL to podcast RSS feed. | -| --file | String | true\* | Path to local RSS file. | -| --out-dir | String | false | Specify output directory for episodes and metadata. Defaults to "./{{podcast_title}}". See "Template Options" for more details. | -| --threads | Number | false | Determines the number of downloads that will happen concurrently. Default is 1. | -| --attempts | Number | false | Sets the number of download attempts per individual file. Default is 3. | -| --archive | [String] | false | Download or write out items not listed in archive file. Generates archive file at path if not found. Defaults to "./{{podcast_title}}/archive.json" when used as a boolean option. See "Template Options" for more details. | -| --episode-template | String | false | Template for generating episode related filenames. See "Template Options" for details. | -| --include-meta | | false | Write out podcast metadata to JSON. 
| -| --include-episode-meta | | false | Write out individual episode metadata to JSON. | -| --include-episode-images | | false | Download found episode images. | -| --offset | Number | false | Offset starting download position. Default is 0. | -| --limit | Number | false | Max number of episodes to download. Downloads all by default. | -| --after | String | false | Only download episodes after this date (i.e. MM/DD/YYY, inclusive). | -| --before | String | false | Only download episodes before this date (i.e. MM/DD/YYY, inclusive) | -| --episode-regex | String | false | Match episode title against provided regex before starting download. | -| --episode-digits | Number | false | Minimum number of digits to use for episode numbering (e.g. 3 would generate "001" instead of "1"). Default is 0. | -| --episode-num-offset | Number | false | Offset the acquired episode number. Default is 0. | -| --episode-source-order | String | false | Attempted order to extract episode audio URL from RSS feed. Default is "enclosure,link". | -| --add-mp3-metadata | | false | Attempts to add a base level of episode metadata to each episode. Recommended only in cases where the original metadata is of poor quality. (**ffmpeg required**) | -| --adjust-bitrate | String (e.g. "48k") | false | Attempts to adjust bitrate of episodes. (**ffmpeg required**) | -| --mono | | false | Attempts to force episodes into mono. (**ffmpeg required**) | -| --override | | false | Override local files on collision. | -| --always-postprocess | | false | Always run additional tasks on the file regardless if the file already exists. This includes --add-mp3-metadata, --adjust-bitrate, --mono, and --exec. | -| --reverse | | false | Reverse download direction and start at last RSS item. | -| --info | | false | Print retrieved podcast info instead of downloading. | -| --list | [String] | false | Print episode list instead of downloading. Defaults to "table" when used as a boolean option. "json" is also supported. 
| -| --exec | String | false | Execute a command after each episode is downloaded. See "Template Options" for more details. | -| --parser-config | String | false | Path to JSON file that will be parsed and used to override the default config passed to [rss-parser](https://github.com/rbren/rss-parser#xml-options). | -| --proxy | | false | Enable proxy support. Specify environment variables listed by [global-agent](https://github.com/gajus/global-agent#environment-variables). | -| --help | | false | Output usage information. | +| Option | Type | Required | Description | +| ----------------------------- | ------------------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| --url | String | true\* | URL to podcast RSS feed. | +| --file | String | true\* | Path to local RSS file. | +| --out-dir | String | false | Specify output directory for episodes and metadata. Defaults to "./{{podcast_title}}". See "Template Options" for more details. | +| --threads | Number | false | Determines the number of downloads that will happen concurrently. Default is 1. | +| --attempts | Number | false | Sets the number of download attempts per individual file. Default is 3. | +| --archive | [String] | false | Download or write out items not listed in archive file. Generates archive file at path if not found. Defaults to "./{{podcast_title}}/archive.json" when used as a boolean option. See "Template Options" for more details. | +| --episode-template | String | false | Template for generating episode related filenames. See "Template Options" for details. | +| --include-meta | | false | Write out podcast metadata to JSON. | +| --include-episode-meta | | false | Write out individual episode metadata to JSON. | +| --include-episode-images | | false | Download found episode images. 
| +| --include-episode-transcripts | | false | Download found episode transcripts. | +| --offset | Number | false | Offset starting download position. Default is 0. | +| --limit | Number | false | Max number of episodes to download. Downloads all by default. | +| --after | String | false | Only download episodes after this date (i.e. MM/DD/YYYY, inclusive). | +| --before | String | false | Only download episodes before this date (i.e. MM/DD/YYYY, inclusive). | +| --episode-regex | String | false | Match episode title against provided regex before starting download. | +| --episode-digits | Number | false | Minimum number of digits to use for episode numbering (e.g. 3 would generate "001" instead of "1"). Default is 0. | +| --episode-num-offset | Number | false | Offset the acquired episode number. Default is 0. | +| --episode-source-order | String | false | Attempted order to extract episode audio URL from RSS feed. Default is "enclosure,link". | +| --episode-transcript-types | String | false | List of allowed transcript types in preferred order. Default is "application/json,application/x-subrip,application/srr,application/srt,text/vtt,text/html,text/plain". | +| --add-mp3-metadata | | false | Attempts to add a base level of episode metadata to each episode. Recommended only in cases where the original metadata is of poor quality. (**ffmpeg required**) | +| --adjust-bitrate | String (e.g. "48k") | false | Attempts to adjust bitrate of episodes. (**ffmpeg required**) | +| --mono | | false | Attempts to force episodes into mono. (**ffmpeg required**) | +| --override | | false | Override local files on collision. | +| --always-postprocess | | false | Always run additional tasks on the file regardless if the file already exists. This includes --add-mp3-metadata, --adjust-bitrate, --mono, and --exec. | +| --reverse | | false | Reverse download direction and start at last RSS item. | +| --info | | false | Print retrieved podcast info instead of downloading. 
| +| --list | [String] | false | Print episode list instead of downloading. Defaults to "table" when used as a boolean option. "json" is also supported. | +| --exec | String | false | Execute a command after each episode is downloaded. See "Template Options" for more details. | +| --parser-config | String | false | Path to JSON file that will be parsed and used to override the default config passed to [rss-parser](https://github.com/rbren/rss-parser#xml-options). | +| --proxy | | false | Enable proxy support. Specify environment variables listed by [global-agent](https://github.com/gajus/global-agent#environment-variables). | +| --help | | false | Output usage information. | ## Archive diff --git a/bin/bin.js b/bin/bin.js index 21e30c8..0299c5a 100755 --- a/bin/bin.js +++ b/bin/bin.js @@ -41,10 +41,12 @@ const { episodeRegex, episodeSourceOrder, episodeTemplate, + episodeTranscriptTypes, exec, file, includeEpisodeImages, includeEpisodeMeta, + includeEpisodeTranscripts, includeMeta, info, limit, @@ -212,6 +214,8 @@ const main = async () => { episodeSourceOrder, episodeTemplate, includeEpisodeImages, + includeEpisodeTranscripts, + episodeTranscriptTypes, }); if (!targetItems.length) { diff --git a/bin/commander.js b/bin/commander.js index c33368b..bb2a574 100644 --- a/bin/commander.js +++ b/bin/commander.js @@ -1,4 +1,8 @@ -import { AUDIO_ORDER_TYPES, ITEM_LIST_FORMATS } from "./util.js"; +import { + AUDIO_ORDER_TYPES, + ITEM_LIST_FORMATS, + TRANSCRIPT_TYPES, +} from "./util.js"; import { createParseNumber, hasFfmpeg } from "./validate.js"; import { logErrorAndExit } from "./logger.js"; @@ -58,6 +62,35 @@ export const setupCommander = (commander, argv) => { "--include-episode-meta", "write out individual episode metadata to json" ) + .option( + "--include-episode-transcripts", + "download found episode transcripts" + ) + .option( + "--episode-transcript-types <string>", + "list of allowed transcript types in preferred order", + (value) => { + const parsed = 
value.split(",").map((type) => type.trim()); + const isValid = parsed.every((type) => !!TRANSCRIPT_TYPES[type]); + + if (!isValid) { + logErrorAndExit( + `Invalid type found in --episode-transcript-types: ${value}\n` + ); + } + + return parsed; + }, + [ + TRANSCRIPT_TYPES["application/json"], + TRANSCRIPT_TYPES["application/x-subrip"], + TRANSCRIPT_TYPES["application/srr"], + TRANSCRIPT_TYPES["application/srt"], + TRANSCRIPT_TYPES["text/vtt"], + TRANSCRIPT_TYPES["text/html"], + TRANSCRIPT_TYPES["text/plain"], + ] + ) .option("--include-episode-images", "download found episode images") .option( "--offset ", diff --git a/bin/util.js b/bin/util.js index cd694df..c876e6a 100644 --- a/bin/util.js +++ b/bin/util.js @@ -11,6 +11,13 @@ import { getArchiveFilename, getItemFilename } from "./naming.js"; const execWithPromise = util.promisify(exec); const isWin = process.platform === "win32"; +const defaultRssParserConfig = { + defaultRSS: 2.0, + customFields: { + item: [["podcast:transcript", "podcastTranscripts", { keepArray: true }]], + }, +}; + /* Escape arguments for a shell command used with exec. 
Borrowed from shell-escape: https://github.com/xxorax/node-shell-escape/ @@ -159,6 +166,8 @@ const getItemsToDownload = ({ episodeSourceOrder, episodeTemplate, includeEpisodeImages, + includeEpisodeTranscripts, + episodeTranscriptTypes, }) => { const { startIndex, shouldGo, next } = getLoopControls({ offset, @@ -256,6 +265,46 @@ const getItemsToDownload = ({ } } + if (includeEpisodeTranscripts) { + const episodeTranscriptUrl = getTranscriptUrl( + item, + episodeTranscriptTypes + ); + + if (episodeTranscriptUrl) { + const episodeTranscriptFileExt = getUrlExt(episodeTranscriptUrl); + const episodeTranscriptArchiveKey = getArchiveKey({ + prefix: archiveUrl, + name: getArchiveFilename({ + pubDate, + name: title, + ext: episodeTranscriptFileExt, + }), + }); + + const episodeTranscriptName = getItemFilename({ + item, + feed, + url: episodeAudioUrl, + ext: episodeTranscriptFileExt, + template: episodeTemplate, + width: episodeDigits, + offset: episodeNumOffset, + }); + + const outputTranscriptPath = path.resolve( + basePath, + episodeTranscriptName + ); + + item._extra_downloads.push({ + url: episodeTranscriptUrl, + outputPath: outputTranscriptPath, + key: episodeTranscriptArchiveKey, + }); + } + } + items.push(item); } @@ -474,12 +523,40 @@ const getImageUrl = ({ image, itunes }) => { return null; }; -const getFileFeed = async (filePath, parserConfig) => { - const defaultConfig = { - defaultRSS: 2.0, - }; +export const TRANSCRIPT_TYPES = { + "application/json": "application/json", + "application/srr": "application/srr", + "application/srt": "application/srt", + "application/x-subrip": "application/x-subrip", + "text/html": "text/html", + "text/plain": "text/plain", + "text/vtt": "text/vtt", +}; + +// @see https://github.com/Podcastindex-org/podcast-namespace/blob/main/docs/1.0.md#transcript +const getTranscriptUrl = (item, transcriptTypes = []) => { + if (!item.podcastTranscripts?.length) { + return null; + } + + for (const transcriptType of transcriptTypes) { + const 
matchingTranscriptType = item.podcastTranscripts.find( + (transcript) => + !!transcript?.["$"]?.url && transcript?.["$"]?.type === transcriptType + ); + + if (matchingTranscriptType) { + return matchingTranscriptType?.["$"]?.url; + } + } + + return null; +}; - const config = parserConfig ? getJsonFile(parserConfig) : defaultConfig; +const getFileFeed = async (filePath, parserConfig) => { + const config = parserConfig + ? getJsonFile(parserConfig) + : defaultRssParserConfig; const rssString = getFileString(filePath); if (parserConfig && !config) { @@ -499,11 +576,9 @@ const getFileFeed = async (filePath, parserConfig) => { }; const getUrlFeed = async (url, parserConfig) => { - const defaultConfig = { - defaultRSS: 2.0, - }; - - const config = parserConfig ? getJsonFile(parserConfig) : defaultConfig; + const config = parserConfig + ? getJsonFile(parserConfig) + : defaultRssParserConfig; if (parserConfig && !config) { logErrorAndExit(`Unable to load parser config: ${parserConfig}`);