+ {/* eslint-disable-next-line jsx-a11y/media-has-caption -- captions not (yet) available */}
+
+
+
+ );
+}
+
export default function LinkContentSection({
bookmark,
}: {
@@ -76,6 +90,8 @@ export default function LinkContentSection({
content = ;
} else if (section === "archive") {
content = ;
+ } else if (section === "video") {
+ content = ;
} else {
content = ;
}
@@ -101,6 +117,9 @@ export default function LinkContentSection({
>
Archive
+
+ Video
+
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index ca0f6608..d5bc555e 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -23,9 +23,10 @@ import puppeteer from "puppeteer-extra";
import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
+import { getBookmarkDetails, updateAsset } from "workerUtils";
import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
-import { db, HoarderDBTransaction } from "@hoarder/db";
+import { db } from "@hoarder/db";
import {
assets,
AssetTypes,
@@ -35,12 +36,12 @@ import {
} from "@hoarder/db/schema";
import {
ASSET_TYPES,
- deleteAsset,
getAssetSize,
IMAGE_ASSET_TYPES,
newAssetId,
saveAsset,
saveAssetFromFile,
+ silentDeleteAsset,
SUPPORTED_UPLOAD_ASSET_TYPES,
} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
@@ -49,6 +50,7 @@ import {
LinkCrawlerQueue,
OpenAIQueue,
triggerSearchReindex,
+ triggerVideoWorker,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
@@ -207,33 +209,6 @@ async function changeBookmarkStatus(
.where(eq(bookmarkLinks.id, bookmarkId));
}
-async function getBookmarkDetails(bookmarkId: string) {
- const bookmark = await db.query.bookmarks.findFirst({
- where: eq(bookmarks.id, bookmarkId),
- with: {
- link: true,
- assets: true,
- },
- });
-
- if (!bookmark || !bookmark.link) {
- throw new Error("The bookmark either doesn't exist or is not a link");
- }
- return {
- url: bookmark.link.url,
- userId: bookmark.userId,
- screenshotAssetId: bookmark.assets.find(
- (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
- )?.id,
- imageAssetId: bookmark.assets.find(
- (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
- )?.id,
- fullPageArchiveAssetId: bookmark.assets.find(
- (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
- )?.id,
- };
-}
-
/**
* This provides some "basic" protection from malicious URLs. However, all of those
* can be easily circumvented by pointing dns of origin to localhost, or with
@@ -609,12 +584,8 @@ async function crawlAndParseUrl(
// Delete the old assets if any
await Promise.all([
- oldScreenshotAssetId
- ? deleteAsset({ userId, assetId: oldScreenshotAssetId }).catch(() => ({}))
- : {},
- oldImageAssetId
- ? deleteAsset({ userId, assetId: oldImageAssetId }).catch(() => ({}))
- : {},
+ silentDeleteAsset(userId, oldScreenshotAssetId),
+ silentDeleteAsset(userId, oldImageAssetId),
]);
return async () => {
@@ -641,9 +612,7 @@ async function crawlAndParseUrl(
);
});
if (oldFullPageArchiveAssetId) {
- await deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
- () => ({}),
- );
+ await silentDeleteAsset(userId, oldFullPageArchiveAssetId);
}
}
};
@@ -713,17 +682,9 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
// Update the search index
await triggerSearchReindex(bookmarkId);
+ // Trigger a potential download of a video from the URL
+ await triggerVideoWorker(bookmarkId, url);
+
// Do the archival as a separate last step as it has the potential for failure
await archivalLogic();
}
-
-async function updateAsset(
- oldAssetId: string | undefined,
- newAsset: DBAssetType,
- txn: HoarderDBTransaction,
-) {
- if (oldAssetId) {
- await txn.delete(assets).where(eq(assets.id, oldAssetId));
- }
- await txn.insert(assets).values(newAsset);
-}
diff --git a/apps/workers/index.ts b/apps/workers/index.ts
index f9a05e59..3b5896e4 100644
--- a/apps/workers/index.ts
+++ b/apps/workers/index.ts
@@ -10,30 +10,39 @@ import { CrawlerWorker } from "./crawlerWorker";
import { shutdownPromise } from "./exit";
import { OpenAiWorker } from "./openaiWorker";
import { SearchIndexingWorker } from "./searchWorker";
+import { VideoWorker } from "./videoWorker";
async function main() {
logger.info(`Workers version: ${serverConfig.serverVersion ?? "not set"}`);
runQueueDBMigrations();
- const [crawler, openai, search, tidyAssets] = [
+ const [crawler, openai, search, tidyAssets, video] = [
await CrawlerWorker.build(),
OpenAiWorker.build(),
SearchIndexingWorker.build(),
TidyAssetsWorker.build(),
+ VideoWorker.build(),
];
await Promise.any([
- Promise.all([crawler.run(), openai.run(), search.run(), tidyAssets.run()]),
+ Promise.all([
+ crawler.run(),
+ openai.run(),
+ search.run(),
+ tidyAssets.run(),
+ video.run(),
+ ]),
shutdownPromise,
]);
logger.info(
- "Shutting down crawler, openai, tidyAssets and search workers ...",
+ "Shutting down crawler, openai, tidyAssets, video and search workers ...",
);
crawler.stop();
openai.stop();
search.stop();
tidyAssets.stop();
+ video.stop();
}
main();
diff --git a/apps/workers/videoWorker.ts b/apps/workers/videoWorker.ts
new file mode 100644
index 00000000..5448f0fa
--- /dev/null
+++ b/apps/workers/videoWorker.ts
@@ -0,0 +1,202 @@
+import fs from "fs";
+import * as os from "os";
+import path from "path";
+import { execa } from "execa";
+import { DequeuedJob, Runner } from "liteque";
+
+import { db } from "@hoarder/db";
+import { AssetTypes } from "@hoarder/db/schema";
+import {
+ ASSET_TYPES,
+ getAssetSize,
+ newAssetId,
+ saveAssetFromFile,
+} from "@hoarder/shared/assetdb";
+import serverConfig from "@hoarder/shared/config";
+import logger from "@hoarder/shared/logger";
+import { VideoWorkerQueue, ZVideoRequest } from "@hoarder/shared/queues";
+
+import { withTimeout } from "./utils";
+import { getBookmarkDetails, updateAsset } from "./workerUtils";
+
+const TMP_FOLDER = path.join(os.tmpdir(), "video_downloads");
+
+export class VideoWorker {
+ static build() {
+ logger.info("Starting video worker ...");
+
+ return new Runner<ZVideoRequest>(
+ VideoWorkerQueue,
+ {
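+ // Note: the same configured timeout bounds both the in-process run (via withTimeout) and the queue job itself (timeoutSecs below).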
+ run: withTimeout(
+ runWorker,
+ /* timeoutSec */ serverConfig.crawler.downloadVideoTimeout,
+ ),
+ onComplete: async (job) => {
+ const jobId = job?.id ?? "unknown";
+ logger.info(
+ `[VideoCrawler][${jobId}] Video Download Completed successfully`,
+ );
+ return Promise.resolve();
+ },
+ onError: async (job) => {
+ const jobId = job?.id ?? "unknown";
+ logger.error(
+ `[VideoCrawler][${jobId}] Video Download job failed: ${job.error}`,
+ );
+ return Promise.resolve();
+ },
+ },
+ {
+ pollIntervalMs: 1000,
+ timeoutSecs: serverConfig.crawler.downloadVideoTimeout,
+ concurrency: 1,
+ },
+ );
+ }
+}
+
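+// Builds the yt-dlp argument list. For illustration only (assuming the default
+// 50MB size cap), the resulting invocation looks roughly like:
+//   yt-dlp <url> -f "best[filesize<50M]" -o <assetPath> --no-playlist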
+function prepareYtDlpArguments(url: string, assetPath: string) {
+ // TODO allow custom commandline arguments?
+ const ytDlpArguments = [url];
+ if (serverConfig.crawler.maxVideoDownloadSize > 0) {
+ ytDlpArguments.push(
+ "-f",
+ `best[filesize<${serverConfig.crawler.maxVideoDownloadSize}M]`,
+ );
+ }
+ ytDlpArguments.push("-o", assetPath);
+ ytDlpArguments.push("--no-playlist");
+ return ytDlpArguments;
+}
+
+async function runWorker(job: DequeuedJob<ZVideoRequest>) {
+ const jobId = job.id ?? "unknown";
+ const { bookmarkId } = job.data;
+
+ const {
+ url,
+ userId,
+ videoAssetId: oldVideoAssetId,
+ } = await getBookmarkDetails(bookmarkId);
+
+ if (!serverConfig.crawler.downloadVideo) {
+ logger.info(
+ `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it is disabled in the config.`,
+ );
+ return;
+ }
+
+ const videoAssetId = newAssetId();
+ let assetPath = `${TMP_FOLDER}/${videoAssetId}`;
+ await fs.promises.mkdir(TMP_FOLDER, { recursive: true });
+
+ const ytDlpArguments = prepareYtDlpArguments(url, assetPath);
+
+ try {
+ logger.info(
+ `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
+ );
+
+ await execa`yt-dlp ${ytDlpArguments}`;
+ const downloadPath = await findAssetFile(videoAssetId);
+ if (!downloadPath) {
+ logger.info(
+ "[VideoCrawler][${jobId}] yt-dlp didn't download anything. Skipping ...",
+ );
+ return;
+ }
+ assetPath = downloadPath;
+ } catch (e) {
+ const err = e as Error;
+ if (err.message.includes("ERROR: Unsupported URL:")) {
+ logger.info(
+ `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`,
+ );
+ return;
+ }
+ console.log(JSON.stringify(err));
+ logger.error(
+ `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}"`,
+ );
+ await deleteLeftOverAssetFile(jobId, videoAssetId);
+ return;
+ }
+
+ logger.info(
+ `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
+ );
+ await saveAssetFromFile({
+ userId,
+ assetId: videoAssetId,
+ assetPath,
+ metadata: { contentType: ASSET_TYPES.VIDEO_MP4 },
+ });
+
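+ // Swap the DB record in a single transaction: updateAsset() removes the old
+ // video asset row (if one exists) and inserts the newly downloaded one.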
+ await db.transaction(async (txn) => {
+ await updateAsset(
+ oldVideoAssetId,
+ {
+ id: videoAssetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_VIDEO,
+ contentType: ASSET_TYPES.VIDEO_MP4,
+ size: await getAssetSize({ userId, assetId: videoAssetId }),
+ },
+ txn,
+ );
+ });
+
+ logger.info(
+ `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
+ );
+}
+
+/**
+ * Deletes leftover assets in case the download fails
+ *
+ * @param jobId the id of the job
+ * @param assetId the id of the asset to delete
+ */
+async function deleteLeftOverAssetFile(
+ jobId: string,
+ assetId: string,
+): Promise<void> {
+ let assetFile;
+ try {
+ assetFile = await findAssetFile(assetId);
+ } catch {
+ // ignore exception, no asset file was found
+ return;
+ }
+ if (!assetFile) {
+ return;
+ }
+ logger.info(
+ `[VideoCrawler][${jobId}] Deleting leftover video asset "${assetFile}".`,
+ );
+ try {
+ await fs.promises.rm(assetFile);
+ } catch (e) {
+ logger.error(
+ `[VideoCrawler][${jobId}] Failed deleting leftover video asset "${assetFile}".`,
+ );
+ }
+}
+
+/**
+ * yt-dlp automatically appends a file extension to the provided filename, so we have to search the folder for the resulting file.
+ *
+ * @param assetId the id of the asset to search for
+ * @returns the path to the downloaded asset, or null if no file was found
+ */
+async function findAssetFile(assetId: string): Promise<string | null> {
+ const files = await fs.promises.readdir(TMP_FOLDER);
+ for (const file of files) {
+ if (file.startsWith(assetId)) {
+ return path.join(TMP_FOLDER, file);
+ }
+ }
+ return null;
+}
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
new file mode 100644
index 00000000..e93d241b
--- /dev/null
+++ b/apps/workers/workerUtils.ts
@@ -0,0 +1,48 @@
+import { eq } from "drizzle-orm";
+
+import { db, HoarderDBTransaction } from "@hoarder/db";
+import { assets, AssetTypes, bookmarks } from "@hoarder/db/schema";
+
+type DBAssetType = typeof assets.$inferInsert;
+
+export async function updateAsset(
+ oldAssetId: string | undefined,
+ newAsset: DBAssetType,
+ txn: HoarderDBTransaction,
+) {
+ if (oldAssetId) {
+ await txn.delete(assets).where(eq(assets.id, oldAssetId));
+ }
+
+ await txn.insert(assets).values(newAsset);
+}
+
+export async function getBookmarkDetails(bookmarkId: string) {
+ const bookmark = await db.query.bookmarks.findFirst({
+ where: eq(bookmarks.id, bookmarkId),
+ with: {
+ link: true,
+ assets: true,
+ },
+ });
+
+ if (!bookmark || !bookmark.link) {
+ throw new Error("The bookmark either doesn't exist or is not a link");
+ }
+ return {
+ url: bookmark.link.url,
+ userId: bookmark.userId,
+ screenshotAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
+ )?.id,
+ imageAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
+ )?.id,
+ fullPageArchiveAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ )?.id,
+ videoAssetId: bookmark.assets.find(
+ (a) => a.assetType == AssetTypes.LINK_VIDEO,
+ )?.id,
+ };
+}
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 777981b3..44d1a09f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -67,7 +67,7 @@ COPY --chmod=755 ./docker/root/etc/s6-overlay /etc/s6-overlay
######################
# Install runtime deps
######################
-RUN apk add --no-cache monolith
+RUN apk add --no-cache monolith yt-dlp
######################
# Prepare the web app
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 8e66a407..c8fb4cc2 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -62,18 +62,21 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
## Crawler Configs
-| Name | Required | Default | Description |
-| ----------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
-| BROWSER_WEB_URL | No | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
-| BROWSER_WEBSOCKET_URL | No | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
-| BROWSER_CONNECT_ONDEMAND | No | false | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
-| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
-| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
-| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
-| CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name | Required | Default | Description |
+| ---------------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| BROWSER_WEB_URL | No | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_WEBSOCKET_URL | No | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_CONNECT_ONDEMAND | No | false | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
+| CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| CRAWLER_VIDEO_DOWNLOAD | No | false | Whether to download videos from the page or not (using yt-dlp) |
+| CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE | No | 50 | The maximum file size for the downloaded video. The quality will be chosen accordingly. Use -1 to disable the limit. |
+| CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No | 600 | How long to wait for the video download to finish |
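+
+For example, to enable video downloads with a 100 MB size cap and a 15-minute timeout (illustrative values, not the defaults), set:
+
+```
+CRAWLER_VIDEO_DOWNLOAD=true
+CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE=100
+CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC=900
+```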
## OCR Configs
diff --git a/packages/db/schema.ts b/packages/db/schema.ts
index 033295bf..10c69d9d 100644
--- a/packages/db/schema.ts
+++ b/packages/db/schema.ts
@@ -165,6 +165,7 @@ export const enum AssetTypes {
LINK_BANNER_IMAGE = "linkBannerImage",
LINK_SCREENSHOT = "linkScreenshot",
LINK_FULL_PAGE_ARCHIVE = "linkFullPageArchive",
+ LINK_VIDEO = "linkVideo",
BOOKMARK_ASSET = "bookmarkAsset",
UNKNOWN = "unknown",
}
@@ -179,6 +180,7 @@ export const assets = sqliteTable(
AssetTypes.LINK_BANNER_IMAGE,
AssetTypes.LINK_SCREENSHOT,
AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ AssetTypes.LINK_VIDEO,
AssetTypes.BOOKMARK_ASSET,
AssetTypes.UNKNOWN,
],
diff --git a/packages/open-api/hoarder-openapi-spec.json b/packages/open-api/hoarder-openapi-spec.json
index a93a0b9c..eac98326 100644
--- a/packages/open-api/hoarder-openapi-spec.json
+++ b/packages/open-api/hoarder-openapi-spec.json
@@ -140,6 +140,10 @@
"type": "string",
"nullable": true
},
+ "videoAssetId": {
+ "type": "string",
+ "nullable": true
+ },
"favicon": {
"type": "string",
"nullable": true,
@@ -245,6 +249,7 @@
"screenshot",
"bannerImage",
"fullPageArchive",
+ "video",
"bookmarkAsset",
"unknown"
]
@@ -488,6 +493,10 @@
"type": "string",
"nullable": true
},
+ "videoAssetId": {
+ "type": "string",
+ "nullable": true
+ },
"favicon": {
"type": "string",
"nullable": true,
diff --git a/packages/shared/assetdb.ts b/packages/shared/assetdb.ts
index 64413e9f..fb7d2461 100644
--- a/packages/shared/assetdb.ts
+++ b/packages/shared/assetdb.ts
@@ -13,6 +13,7 @@ export const enum ASSET_TYPES {
IMAGE_WEBP = "image/webp",
APPLICATION_PDF = "application/pdf",
TEXT_HTML = "text/html",
+ VIDEO_MP4 = "video/mp4",
}
export const IMAGE_ASSET_TYPES: Set<string> = new Set([
@@ -31,6 +32,7 @@ export const SUPPORTED_UPLOAD_ASSET_TYPES: Set<string> = new Set([
export const SUPPORTED_ASSET_TYPES: Set<string> = new Set([
...SUPPORTED_UPLOAD_ASSET_TYPES,
ASSET_TYPES.TEXT_HTML,
+ ASSET_TYPES.VIDEO_MP4,
]);
function getAssetDir(userId: string, assetId: string) {
@@ -152,6 +154,20 @@ export async function getAssetSize({
return stat.size;
}
+/**
+ * Deletes the passed in asset if it exists and ignores any errors
+ * @param userId the id of the user the asset belongs to
+ * @param assetId the id of the asset to delete
+ */
+export async function silentDeleteAsset(
+ userId: string,
+ assetId: string | undefined,
+) {
+ if (assetId) {
+ await deleteAsset({ userId, assetId }).catch(() => ({}));
+ }
+}
+
export async function deleteAsset({
userId,
assetId,
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 4b51d15d..35d3df54 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -42,6 +42,9 @@ const allEnv = z.object({
CRAWLER_STORE_SCREENSHOT: stringBool("true"),
CRAWLER_FULL_PAGE_SCREENSHOT: stringBool("false"),
CRAWLER_FULL_PAGE_ARCHIVE: stringBool("false"),
+ CRAWLER_VIDEO_DOWNLOAD: stringBool("false"),
+ CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE: z.coerce.number().default(50),
+ CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC: z.coerce.number().default(10 * 60),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
@@ -98,6 +101,9 @@ const serverConfigSchema = allEnv.transform((val) => {
storeScreenshot: val.CRAWLER_STORE_SCREENSHOT,
fullPageScreenshot: val.CRAWLER_FULL_PAGE_SCREENSHOT,
fullPageArchive: val.CRAWLER_FULL_PAGE_ARCHIVE,
+ downloadVideo: val.CRAWLER_VIDEO_DOWNLOAD,
+ maxVideoDownloadSize: val.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE,
+ downloadVideoTimeout: val.CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC,
},
ocr: {
langs: val.OCR_LANGS,
diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts
index 0cb30aae..6189a633 100644
--- a/packages/shared/queues.ts
+++ b/packages/shared/queues.ts
@@ -93,3 +93,26 @@ export async function triggerSearchDeletion(bookmarkId: string) {
type: "delete",
});
}
+
+export const zvideoRequestSchema = z.object({
+ bookmarkId: z.string(),
+ url: z.string(),
+});
+export type ZVideoRequest = z.infer<typeof zvideoRequestSchema>;
+
+export const VideoWorkerQueue = new SqliteQueue<ZVideoRequest>(
+ "video_queue",
+ queueDB,
+ {
+ defaultJobArgs: {
+ numRetries: 5,
+ },
+ },
+);
+
+export async function triggerVideoWorker(bookmarkId: string, url: string) {
+ await VideoWorkerQueue.enqueue({
+ bookmarkId,
+ url,
+ });
+}
diff --git a/packages/shared/types/bookmarks.ts b/packages/shared/types/bookmarks.ts
index c731cb32..2d46684e 100644
--- a/packages/shared/types/bookmarks.ts
+++ b/packages/shared/types/bookmarks.ts
@@ -15,6 +15,7 @@ export const zAssetTypesSchema = z.enum([
"screenshot",
"bannerImage",
"fullPageArchive",
+ "video",
"bookmarkAsset",
"unknown",
]);
@@ -34,6 +35,7 @@ export const zBookmarkedLinkSchema = z.object({
imageAssetId: z.string().nullish(),
screenshotAssetId: z.string().nullish(),
fullPageArchiveAssetId: z.string().nullish(),
+ videoAssetId: z.string().nullish(),
favicon: z.string().url().nullish(),
htmlContent: z.string().nullish(),
crawledAt: z.date().nullish(),
diff --git a/packages/trpc/lib/attachments.ts b/packages/trpc/lib/attachments.ts
index 175947f8..0fd41d1b 100644
--- a/packages/trpc/lib/attachments.ts
+++ b/packages/trpc/lib/attachments.ts
@@ -8,6 +8,7 @@ export function mapDBAssetTypeToUserType(assetType: AssetTypes): ZAssetType {
[AssetTypes.LINK_SCREENSHOT]: "screenshot",
[AssetTypes.LINK_FULL_PAGE_ARCHIVE]: "fullPageArchive",
[AssetTypes.LINK_BANNER_IMAGE]: "bannerImage",
+ [AssetTypes.LINK_VIDEO]: "video",
[AssetTypes.BOOKMARK_ASSET]: "bookmarkAsset",
[AssetTypes.UNKNOWN]: "bannerImage",
};
@@ -21,6 +22,7 @@ export function mapSchemaAssetTypeToDB(
screenshot: AssetTypes.LINK_SCREENSHOT,
fullPageArchive: AssetTypes.LINK_FULL_PAGE_ARCHIVE,
bannerImage: AssetTypes.LINK_BANNER_IMAGE,
+ video: AssetTypes.LINK_VIDEO,
bookmarkAsset: AssetTypes.BOOKMARK_ASSET,
unknown: AssetTypes.UNKNOWN,
};
@@ -32,6 +34,7 @@ export function humanFriendlyNameForAssertType(type: ZAssetType) {
screenshot: "Screenshot",
fullPageArchive: "Full Page Archive",
bannerImage: "Banner Image",
+ video: "Video",
bookmarkAsset: "Bookmark Asset",
unknown: "Unknown",
};
@@ -43,6 +46,7 @@ export function isAllowedToAttachAsset(type: ZAssetType) {
screenshot: true,
fullPageArchive: false,
bannerImage: true,
+ video: false,
bookmarkAsset: false,
unknown: false,
};
@@ -54,6 +58,7 @@ export function isAllowedToDetachAsset(type: ZAssetType) {
screenshot: true,
fullPageArchive: true,
bannerImage: true,
+ video: true,
bookmarkAsset: false,
unknown: false,
};
diff --git a/packages/trpc/routers/bookmarks.ts b/packages/trpc/routers/bookmarks.ts
index 80dd4bec..9a27c25a 100644
--- a/packages/trpc/routers/bookmarks.ts
+++ b/packages/trpc/routers/bookmarks.ts
@@ -215,6 +215,8 @@ function toZodSchema(bookmark: BookmarkQueryReturnType): ZBookmark {
imageAssetId: assets.find(
(a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
)?.id,
+ videoAssetId: assets.find((a) => a.assetType == AssetTypes.LINK_VIDEO)
+ ?.id,
...link,
};
break;
@@ -698,6 +700,9 @@ export const bookmarksAppRouter = router({
if (row.assets.assetType == AssetTypes.LINK_BANNER_IMAGE) {
content.imageAssetId = row.assets.id;
}
+ if (row.assets.assetType == AssetTypes.LINK_VIDEO) {
+ content.videoAssetId = row.assets.id;
+ }
acc[bookmarkId].content = content;
}
acc[bookmarkId].assets.push({