Merge pull request #4356 from remotion-dev/openai-whisper

remotion-dev · Oct 4, 2024 · dac638c · dac638c
2 parents 687c235 + 244c55d
commit dac638c
Show file tree

Hide file tree

Showing 29 changed files with 942 additions and 33 deletions.
diff --git a/packages/STATS.md b/packages/STATS.md
@@ -30,6 +30,7 @@ Monthly downloads of Remotion packages
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/media-utils.svg?style=flat&color=black&label=@remotion/media-utils)](https://npmcharts.com/compare/@remotion/media-utils?minimal=true)  
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/motion-blur.svg?style=flat&color=black&label=@remotion/motion-blur)](https://npmcharts.com/compare/@remotion/motion-blur?minimal=true)  
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/noise.svg?style=flat&color=black&label=@remotion/noise)](https://npmcharts.com/compare/@remotion/noise?minimal=true)  
+[![NPM Downloads](https://img.shields.io/npm/dm/@remotion/openai-whisper.svg?style=flat&color=black&label=@remotion/openai-whisper)](https://npmcharts.com/compare/@remotion/openai-whisper?minimal=true)  
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/paths.svg?style=flat&color=black&label=@remotion/paths)](https://npmcharts.com/compare/@remotion/paths?minimal=true)  
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/player.svg?style=flat&color=black&label=@remotion/player)](https://npmcharts.com/compare/@remotion/player?minimal=true)  
 [![NPM Downloads](https://img.shields.io/npm/dm/@remotion/preload.svg?style=flat&color=black&label=@remotion/preload)](https://npmcharts.com/compare/@remotion/preload?minimal=true)  

diff --git a/packages/ai-improvements/package.json b/packages/ai-improvements/package.json
@@ -4,7 +4,7 @@
 	},
 	"name": "ai-improvements",
 	"dependencies": {
-		"openai": "^4.18.0"
+		"openai": "4.67.1"
 	},
 	"private": true,
 	"version": "4.0.216"

diff --git a/packages/cli/src/list-of-remotion-packages.ts b/packages/cli/src/list-of-remotion-packages.ts
@@ -31,6 +31,7 @@ export const listOfRemotionPackages = [
 	'@remotion/tailwind',
 	'@remotion/transitions',
 	'@remotion/install-whisper-cpp',
+	'@remotion/openai-whisper',
 	'@remotion/captions',
 	'@remotion/animation-utils',
 	'@remotion/animated-emoji',

diff --git a/packages/create-video/src/list-of-remotion-packages.ts b/packages/create-video/src/list-of-remotion-packages.ts
@@ -31,6 +31,7 @@ export const listOfRemotionPackages = [
 	'@remotion/tailwind',
 	'@remotion/transitions',
 	'@remotion/install-whisper-cpp',
+	'@remotion/openai-whisper',
 	'@remotion/captions',
 	'@remotion/animation-utils',
 	'@remotion/animated-emoji',

diff --git a/packages/docs/components/TableOfContents/api.tsx b/packages/docs/components/TableOfContents/api.tsx
@@ -5,6 +5,7 @@ import {TableOfContents as EnableScssTableOfContents} from '../../docs/enable-sc
 import {TableOfContents as FontsTableOfContents} from '../../docs/fonts-api/TableOfContents';
 import {TableOfContents as InstallWhisperCppTableOfContents} from '../../docs/install-whisper-cpp/install-whisper-cpp';
 import {TableOfContents as MediaParserTableOfContents} from '../../docs/media-parser/TableOfContents';
+import {TableOfContents as OpenAiWhisperTableOfContents} from '../../docs/openai-whisper/TableOfContents';
 import {TableOfContents as SkiaTableOfContents} from '../../docs/skia/TableOfContents';
 import {TableOfContents as StudioTableOfContents} from '../../docs/studio/TableOfContents';
 import {TableOfContents as TailwindTableOfContents} from '../../docs/tailwind/TableOfContents';
@@ -121,6 +122,9 @@ export const TableOfContents: React.FC = () => {
 			<h2>@remotion/install-whisper-cpp</h2>
 			<p>Whisper.cpp installation and transcription</p>
 			<InstallWhisperCppTableOfContents />
+			<h2>@remotion/openai-whisper</h2>
+			<p>Work with transcriptions from OpenAI Whisper</p>
+			<OpenAiWhisperTableOfContents />
 			<h2>@remotion/animated-emoji</h2>
 			<p>Google Fonts Animated Emojis as Remotion Components</p>
 			<AnimatedEmojiTableOfContents />

diff --git a/packages/docs/docs/captions/caption.mdx b/packages/docs/docs/captions/caption.mdx
@@ -9,16 +9,16 @@ crumb: '@remotion/captions'
 This is a simple data structure for a caption.
 
 ```tsx twoslash
-import {Caption} from '@remotion/captions';
-//      ^?
+import type {Caption} from '@remotion/captions';
+//            ^?
 ```
 
 By establishing a standard data structure, we allow many operations that involve captions to be interoperable:
 
-- Transcribing (using the [`@remotion/install-whisper-cpp`](/docs/install-whisper-cpp) package)
-- Formatting (for example, creating pages using [`createTikTokStyleCaptions()`](/docs/captions/create-tiktok-style-captions))
-- Parsing (using the [`parseSrt()`](/docs/captions/parse-srt) function)
-- Serializing (for example to a .srt file)
+- **Transcribing**: Using the [`@remotion/install-whisper-cpp`](/docs/install-whisper-cpp) or [`@remotion/openai-whisper`](/docs/openai-whisper) packages
+- **Formatting**: For example, creating pages using [`createTikTokStyleCaptions()`](/docs/captions/create-tiktok-style-captions)
+- **Parsing**: Using the [`parseSrt()`](/docs/captions/parse-srt) function
+- **Serializing**: For example to a `.srt` file using [`serializeSrt()`](/docs/captions/serialize-srt)
 
 ## Fields
 

diff --git a/packages/docs/docs/openai-whisper/TableOfContents.tsx b/packages/docs/docs/openai-whisper/TableOfContents.tsx
@@ -0,0 +1,19 @@
+import React from 'react';
+import {Grid} from '../../components/TableOfContents/Grid';
+import {TOCItem} from '../../components/TableOfContents/TOCItem';
+
+/**
+ * Table of contents for the `@remotion/openai-whisper` docs: a grid with a
+ * single entry that links to the `openAiWhisperApiToCaptions()` page.
+ */
+export const TableOfContents: React.FC = () => (
+	<div>
+		<Grid>
+			<TOCItem link="/docs/openai-whisper/openai-whisper-api-to-captions">
+				<strong>{'openAiWhisperApiToCaptions()'}</strong>
+				<div>
+					Turn OpenAI Whisper API transcriptions into an array of{' '}
+					<code>Caption</code>
+				</div>
+			</TOCItem>
+		</Grid>
+	</div>
+);
diff --git a/packages/docs/docs/openai-whisper/index.mdx b/packages/docs/docs/openai-whisper/index.mdx
@@ -0,0 +1,28 @@
+---
+image: /generated/articles-docs-openai-whisper-index.png
+title: '@remotion/openai-whisper'
+crumb: 'Subtitle videos'
+---
+
+_Available from v4.0.217_
+
+The `@remotion/openai-whisper` package provides utilities for working with the OpenAI Whisper API.
+
+It provides a transformation function for converting the output of the OpenAI Whisper API into an array of [`Caption`](/docs/captions/caption) objects in order to make the captions compatible with other caption-related Remotion APIs.
+
+import {TableOfContents} from './TableOfContents';
+
+<Installation pkg="@remotion/openai-whisper" />
+
+## APIs
+
+<TableOfContents />
+
+## License
+
+MIT
+
+## See also
+
+- [`@remotion/captions`](/docs/captions)
+- [`@remotion/install-whisper-cpp`](/docs/install-whisper-cpp)
diff --git a/packages/docs/docs/openai-whisper/openai-whisper-api-to-captions.mdx b/packages/docs/docs/openai-whisper/openai-whisper-api-to-captions.mdx
@@ -0,0 +1,41 @@
+---
+image: /generated/articles-docs-openai-whisper-openai-whisper-api-to-captions.png
+title: openAiWhisperApiToCaptions()
+crumb: '@remotion/openai-whisper'
+---
+
+# openAiWhisperApiToCaptions()<AvailableFrom v="4.0.217"/>
+
+Turns the output from [`openai.audio.transcriptions.create`](https://platform.openai.com/docs/guides/speech-to-text/transcriptions) from the [openai](https://npm.im/openai) package into an array of [`Caption`](/docs/captions/caption) objects.
+
+This function processes the transcription in order to retain the punctuation around the words, which is not included by default in the word-level output of the OpenAI response.
+
+This function can be used in any JavaScript environment, but you should not use the OpenAI API in the browser because your API key will be exposed to the browser.
+
+```tsx twoslash title="Example usage"
+// @module: ESNext
+// @target: ESNext
+
+import fs from 'fs';
+import {OpenAI} from 'openai';
+import {openAiWhisperApiToCaptions} from '@remotion/openai-whisper';
+
+const openai = new OpenAI();
+
+const transcription = await openai.audio.transcriptions.create({
+  file: fs.createReadStream('audio.mp3'),
+  model: 'whisper-1',
+  response_format: 'verbose_json',
+  prompt: 'Hello, welcome to my lecture.',
+  timestamp_granularities: ['word'],
+});
+
+const {captions} = openAiWhisperApiToCaptions({transcription});
+```
+
+## See also
+
+- [Source code for this function](https://github.com/remotion-dev/remotion/blob/main/packages/openai-whisper/src/openai-whisper-api-to-captions.ts)
+- [`@remotion/openai-whisper`](/docs/openai-whisper)
+- [`@remotion/install-whisper-cpp`](/docs/install-whisper-cpp)
+- [`@remotion/captions`](/docs/captions)
diff --git a/packages/docs/package.json b/packages/docs/package.json
@@ -51,6 +51,7 @@
 		"@remotion/noise": "workspace:*",
 		"@remotion/paths": "workspace:*",
 		"@remotion/player": "workspace:*",
+		"@remotion/openai-whisper": "workspace:*",
 		"@remotion/preload": "workspace:*",
 		"@remotion/renderer": "workspace:*",
 		"@remotion/rive": "workspace:*",
@@ -94,6 +95,7 @@
 		"@types/unist": "^2.0.0",
 		"@types/node": "20.12.14",
 		"lodash.sortby": "^4.7.0",
-		"url-loader": "^4.1.1"
+		"url-loader": "^4.1.1",
+		"openai": "4.67.1"
 	}
 }
diff --git a/packages/docs/sidebars.js b/packages/docs/sidebars.js
@@ -596,6 +596,15 @@ module.exports = {
 				'install-whisper-cpp/convert-to-captions',
 			],
 		},
+		{
+			type: 'category',
+			label: '@remotion/openai-whisper',
+			link: {
+				type: 'doc',
+				id: 'openai-whisper/index',
+			},
+			items: ['openai-whisper/openai-whisper-api-to-captions'],
+		},
 		{
 			type: 'category',
 			label: '@remotion/fonts',

diff --git a/packages/docs/src/data/articles.ts b/packages/docs/src/data/articles.ts
@@ -398,6 +398,13 @@ export const articles = [
 		compId: 'articles-docs-renderer-render-frames',
 		crumb: '@remotion/renderer',
 	},
+	{
+		id: 'metadata',
+		title: 'Setting video metadata',
+		relativePath: 'docs/metadata.mdx',
+		compId: 'articles-docs-metadata',
+		crumb: 'Techniques',
+	},
 	{
 		id: 'bundle',
 		title: 'bundle()',
@@ -790,6 +797,20 @@ export const articles = [
 		compId: 'articles-docs-get-image-dimensions',
 		crumb: '@remotion/media-utils',
 	},
+	{
+		id: 'openai-whisper/index',
+		title: '@remotion/openai-whisper',
+		relativePath: 'docs/openai-whisper/index.mdx',
+		compId: 'articles-docs-openai-whisper-index',
+		crumb: 'Subtitle videos',
+	},
+	{
+		id: 'openai-whisper/openai-whisper-api-to-captions',
+		title: 'openAiWhisperApiToCaptions()',
+		relativePath: 'docs/openai-whisper/openai-whisper-api-to-captions.mdx',
+		compId: 'articles-docs-openai-whisper-openai-whisper-api-to-captions',
+		crumb: '@remotion/openai-whisper',
+	},
 	{
 		id: 'cancel-render',
 		title: 'cancelRender()',
@@ -1633,13 +1654,6 @@ export const articles = [
 		compId: 'articles-docs-miscellaneous-chrome-headless-shell',
 		crumb: 'FAQ',
 	},
-	{
-		id: 'miscellaneous/render-in-browser',
-		title: 'Can I render videos in the browser?',
-		relativePath: 'docs/miscellaneous/render-in-browser.mdx',
-		compId: 'articles-docs-miscellaneous-render-in-browser',
-		crumb: 'Techniques',
-	},
 	{
 		id: 'miscellaneous/render-in-browser',
 		title: 'Can I render videos in the browser?',

diff --git a/packages/docs/static/generated/articles-docs-openai-whisper-index.png b/packages/docs/static/generated/articles-docs-openai-whisper-index.png
diff --git a/...tatic/generated/articles-docs-openai-whisper-openai-whisper-api-to-captions.png b/...tatic/generated/articles-docs-openai-whisper-openai-whisper-api-to-captions.png
diff --git a/packages/lambda-php/.phpunit.result.cache b/packages/lambda-php/.phpunit.result.cache
@@ -1 +1 @@
-{"version":1,"defects":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":5},"times":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":0.037,"Remotion\\LambdaPhp\\Tests\\PHPRenderProgressTest::testClient":0.017}}
+{"version":1,"defects":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":5},"times":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":0.037,"Remotion\\LambdaPhp\\Tests\\PHPRenderProgressTest::testClient":0.02}}
diff --git a/packages/openai-whisper/.eslintrc b/packages/openai-whisper/.eslintrc
@@ -0,0 +1,3 @@
+{
+	"extends": "@jonny",
+}
diff --git a/packages/openai-whisper/README.md b/packages/openai-whisper/README.md
@@ -0,0 +1,18 @@
+# @remotion/openai-whisper
+
+Work with the output of the OpenAI Whisper API
+
+[![NPM Downloads](https://img.shields.io/npm/dm/@remotion/openai-whisper.svg?style=flat&color=black&label=Downloads)](https://npmcharts.com/compare/@remotion/openai-whisper?minimal=true)
+
+## Installation
+
+```bash
+npm install @remotion/openai-whisper --save-exact
+```
+
+When installing a Remotion package, make sure to align the version of all `remotion` and `@remotion/*` packages to the same version.
+Remove the `^` character from the version number to use the exact version.
+
+## Usage
+
+See the [documentation](https://www.remotion.dev/docs/openai-whisper) for more information.
diff --git a/packages/openai-whisper/package.json b/packages/openai-whisper/package.json
@@ -0,0 +1,37 @@
+{
+	"repository": {
+		"url": "https://github.com/remotion-dev/remotion/tree/main/packages/openai-whisper"
+	},
+	"name": "@remotion/openai-whisper",
+	"version": "4.0.216",
+	"description": "Work with the output of the OpenAI Whisper API",
+	"main": "dist/index.js",
+	"sideEffects": false,
+	"bugs": {
+		"url": "https://github.com/remotion-dev/remotion/issues"
+	},
+	"scripts": {
+		"formatting": "prettier src --check",
+		"lint": "eslint src --ext ts,tsx",
+		"test": "bun test src"
+	},
+	"files": [
+		"dist"
+	],
+	"author": "Jonny Burger <[email protected]>",
+	"license": "MIT",
+	"dependencies": {
+		"@remotion/captions": "workspace:*"
+	},
+	"peerDependencies": {},
+	"devDependencies": {
+		"openai": "4.67.1"
+	},
+	"keywords": [
+		"remotion"
+	],
+	"publishConfig": {
+		"access": "public"
+	},
+	"homepage": "https://www.remotion.dev/docs/openai-whisper"
+}
diff --git a/packages/openai-whisper/src/index.ts b/packages/openai-whisper/src/index.ts
@@ -0,0 +1,7 @@
+// `OpenAiToCaptionsInput`, `OpenAiToCaptionsOutput` and
+// `OpenAiVerboseTranscription` are types only. They are re-exported with
+// `export type` so that per-file transpilers (`isolatedModules`) can erase
+// them instead of emitting a re-export of a binding that does not exist at
+// runtime.
+export type {
+	OpenAiToCaptionsInput,
+	OpenAiToCaptionsOutput,
+} from './openai-whisper-api-to-captions';
+export {openAiWhisperApiToCaptions} from './openai-whisper-api-to-captions';
+
+export type {OpenAiVerboseTranscription} from './openai-format';
diff --git a/packages/openai-whisper/src/openai-format.ts b/packages/openai-whisper/src/openai-format.ts
@@ -0,0 +1,27 @@
+/**
+ * One word entry of a transcription together with its time range.
+ * `openAiWhisperApiToCaptions()` multiplies `start` and `end` by 1000 to get
+ * milliseconds, i.e. it treats them as seconds.
+ */
+export interface TranscriptionWord {
+	end: number;
+	start: number;
+	// The word as a string; `openAiWhisperApiToCaptions()` looks it up in the full transcription text to recover surrounding punctuation.
+	word: string;
+}
+
+/**
+ * Shape of one entry of `OpenAiVerboseTranscription.segments`.
+ * NOTE(review): presumably mirrors the segment object of OpenAI's
+ * `verbose_json` transcription response — confirm field semantics against
+ * the OpenAI API reference.
+ */
+export interface TranscriptionSegment {
+	id: number;
+	avg_logprob: number;
+	compression_ratio: number;
+	end: number;
+	no_speech_prob: number;
+	seek: number;
+	start: number;
+	temperature: number;
+	text: string;
+	tokens: Array<number>;
+}
+
+/**
+ * Transcription object accepted by `openAiWhisperApiToCaptions()`.
+ * NOTE(review): presumably the `verbose_json` response of the OpenAI
+ * transcription API — confirm against the `openai` package typings.
+ */
+export interface OpenAiVerboseTranscription {
+	duration: number | string;
+	task?: string;
+	language: string;
+	// Full transcribed text; `openAiWhisperApiToCaptions()` consumes it word by word to recover punctuation.
+	text: string;
+	segments?: Array<TranscriptionSegment>;
+	// Optional here, but `openAiWhisperApiToCaptions()` throws when it is missing.
+	words?: Array<TranscriptionWord>;
+}
diff --git a/packages/openai-whisper/src/openai-whisper-api-to-captions.ts b/packages/openai-whisper/src/openai-whisper-api-to-captions.ts
@@ -0,0 +1,47 @@
+import type {Caption} from '@remotion/captions';
+import type {OpenAiVerboseTranscription} from './openai-format';
+
+/** Input of `openAiWhisperApiToCaptions()`. */
+export type OpenAiToCaptionsInput = {
+	// The transcription to convert; its `words` array must be present, otherwise the function throws.
+	transcription: OpenAiVerboseTranscription;
+};
+
+/** Return value of `openAiWhisperApiToCaptions()`. */
+export type OpenAiToCaptionsOutput = {
+	// One caption per entry of `transcription.words`, in the same order.
+	captions: Caption[];
+};
+
+/**
+ * Escapes every character that has a special meaning in a regular expression
+ * so that `value` can be embedded in a pattern and matched literally.
+ */
+const escapeRegExp = (value: string): string => {
+	return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+};
+
+/**
+ * Converts a transcription with word-level timestamps into an array of
+ * `Caption` objects, one per entry of `transcription.words`.
+ *
+ * The entries of `transcription.words` are located one after another at the
+ * front of `transcription.text`. Each caption's `text` is the matched slice of
+ * that text: up to 4 leading characters (for example whitespace), the word
+ * itself, and up to 3 trailing `?`, `,` or `.` characters.
+ *
+ * `start` and `end` of each word are multiplied by 1000 to fill `startMs` and
+ * `endMs`; `timestampMs` is the midpoint of the two and `confidence` is
+ * always `null`.
+ *
+ * @param input.transcription The transcription to convert. Must contain `words`.
+ * @returns An object whose `captions` array has one entry per word.
+ * @throws Error if `transcription.words` is missing, or if a word cannot be
+ * found at the beginning of the not yet consumed text.
+ */
+export const openAiWhisperApiToCaptions = ({
+	transcription,
+}: OpenAiToCaptionsInput): OpenAiToCaptionsOutput => {
+	const captions: Caption[] = [];
+	if (!transcription.words) {
+		throw new Error(
+			'The transcription needs to have been generated with `timestamp_granularities: ["word"]`',
+		);
+	}
+
+	let remainingText = transcription.text;
+
+	for (const word of transcription.words) {
+		// The word is escaped so that characters such as `+`, `$` or `(` are
+		// matched literally instead of being interpreted as regex syntax.
+		// The leading group is lazy so that the earliest occurrence of the word
+		// is consumed; a greedy group would swallow the first of two repeated
+		// short words ("ha ha") and make the lookup of the next word fail.
+		const match = new RegExp(
+			`^(.{0,4}?)${escapeRegExp(word.word)}([\\?,\\.]{0,3})?`,
+		).exec(remainingText);
+		if (!match) {
+			throw new Error(
+				`Unable to parse punctuation from OpenAI Whisper output. Could not find word "${word.word}" in text "${remainingText.slice(0, 100)}". File an issue under https://remotion.dev/issue to ask for a fix.`,
+			);
+		}
+
+		const foundText = match[0];
+		// Drop the consumed slice so the next word is searched right after it.
+		remainingText = remainingText.slice(foundText.length);
+
+		captions.push({
+			confidence: null,
+			endMs: word.end * 1000,
+			startMs: word.start * 1000,
+			text: foundText,
+			timestampMs: ((word.start + word.end) / 2) * 1000,
+		});
+	}
+
+	return {captions};
+};
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"version":1,"defects":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":5},"times":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":0.037,"Remotion\\LambdaPhp\\Tests\\PHPRenderProgressTest::testClient":0.017}}
		{"version":1,"defects":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":5},"times":{"Remotion\\LambdaPhp\\Tests\\PHPClientTest::testClient":0.037,"Remotion\\LambdaPhp\\Tests\\PHPRenderProgressTest::testClient":0.02}}