diff --git a/apps/web/app/api/assets/[assetId]/route.ts b/apps/web/app/api/assets/[assetId]/route.ts
index f3cf1ab4..5bc3f479 100644
--- a/apps/web/app/api/assets/[assetId]/route.ts
+++ b/apps/web/app/api/assets/[assetId]/route.ts
@@ -16,10 +16,29 @@ export async function GET(
     assetId: params.assetId,
   });
 
-  return new Response(asset, {
-    status: 200,
-    headers: {
-      "Content-type": metadata.contentType,
-    },
-  });
+  const range = request.headers.get("Range");
+  if (range) {
+    const parts = range.replace(/bytes=/, "").split("-");
+    const start = parseInt(parts[0], 10);
+    const end = parts[1] ? parseInt(parts[1], 10) : asset.length - 1;
+
+    const chunk = asset.subarray(start, end + 1);
+    return new Response(chunk, {
+      status: 206, // Partial Content
+      headers: {
+        "Content-Range": `bytes ${start}-${end}/${asset.length}`,
+        "Accept-Ranges": "bytes",
+        "Content-Length": chunk.length.toString(),
+        "Content-type": metadata.contentType,
+      },
+    });
+  } else {
+    return new Response(asset, {
+      status: 200,
+      headers: {
+        "Content-Length": asset.length.toString(),
+        "Content-type": metadata.contentType,
+      },
+    });
+  }
 }
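Note: the handler above serves single byte ranges, which is what browser video elements use for seeking. A minimal sketch of a client request against this route (the asset id is a placeholder, not part of the patch):

    const res = await fetch(`/api/assets/${assetId}`, {
      headers: { Range: "bytes=0-1023" },
    });
    // on success: res.status === 206 and a header like
    // Content-Range: "bytes 0-1023/<total size>"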
diff --git a/apps/web/components/dashboard/preview/LinkContentSection.tsx b/apps/web/components/dashboard/preview/LinkContentSection.tsx
index f2069821..bf0d8f90 100644
--- a/apps/web/components/dashboard/preview/LinkContentSection.tsx
+++ b/apps/web/components/dashboard/preview/LinkContentSection.tsx
@@ -60,6 +60,20 @@ function CachedContentSection({ link }: { link: ZBookmarkedLink }) {
   return <ScrollArea className="h-full">{content}</ScrollArea>;
 }
 
+function VideoSection({ link }: { link: ZBookmarkedLink }) {
+  return (
+    <div className="relative h-full w-full">
+      {/* eslint-disable-next-line jsx-a11y/media-has-caption -- captions not (yet) available */}
+      <video className="m-auto max-h-full max-w-full" controls>
+        <source src={`/api/assets/${link.videoAssetId}`} />
+      </video>
+    </div>
+  );
+}
+
 export default function LinkContentSection({
   bookmark,
 }: {
@@ -76,6 +90,8 @@ export default function LinkContentSection({
     content = <CachedContentSection link={bookmark.content} />;
   } else if (section === "archive") {
     content = <FullPageArchiveSection link={bookmark.content} />;
+  } else if (section === "video") {
+    content = <VideoSection link={bookmark.content} />;
   } else {
     content = <ScreenshotSection link={bookmark.content} />;
   }
@@ -101,6 +117,9 @@ export default function LinkContentSection({
         >
           Archive
         </SelectItem>
+        <SelectItem value="video" disabled={!bookmark.content.videoAssetId}>
+          Video
+        </SelectItem>
       </SelectContent>
     </Select>
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index 6aa6b222..fee6ddd1 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -5,11 +5,9 @@ import type { Job } from "bullmq";
 import type { Browser } from "puppeteer";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
-import Database from "better-sqlite3";
 import { Worker } from "bullmq";
 import DOMPurify from "dompurify";
-import { eq, ExtractTablesWithRelations } from "drizzle-orm";
-import { SQLiteTransaction } from "drizzle-orm/sqlite-core";
+import { eq } from "drizzle-orm";
 import { execa } from "execa";
 import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
@@ -27,15 +25,8 @@ import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
 
-import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
-import { db, HoarderDBTransaction } from "@hoarder/db";
-import {
-  assets,
-  AssetTypes,
-  bookmarkAssets,
-  bookmarkLinks,
-  bookmarks,
-} from "@hoarder/db/schema";
+import { db } from "@hoarder/db";
+import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
 import {
   ASSET_TYPES,
   IMAGE_ASSET_TYPES,
@@ -52,9 +43,14 @@ import {
   OpenAIQueue,
   queueConnectionDetails,
   triggerSearchReindex,
+  triggerVideoWorker,
+  ZCrawlLinkRequest,
   zCrawlLinkRequestSchema,
 } from "@hoarder/shared/queues";
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
+import { DBAssetTypes } from "@hoarder/shared/utils/bookmarkUtils";
+
+import { getBookmarkDetails, updateAsset } from "./workerUtils";
 
 const metascraperParser = metascraper([
   metascraperAmazon(),
@@ -202,33 +198,6 @@ async function changeBookmarkStatus(
     .where(eq(bookmarkLinks.id, bookmarkId));
 }
 
-async function getBookmarkDetails(bookmarkId: string) {
-  const bookmark = await db.query.bookmarks.findFirst({
-    where: eq(bookmarks.id, bookmarkId),
-    with: {
-      link: true,
-      assets: true,
-    },
-  });
-
-  if (!bookmark || !bookmark.link) {
-    throw new Error("The bookmark either doesn't exist or not a link");
-  }
-  return {
-    url: bookmark.link.url,
-    userId: bookmark.userId,
-    screenshotAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
-    )?.id,
-    imageAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
-    )?.id,
-    fullPageArchiveAssetId: bookmark.assets.find(
-      (a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
-    )?.id,
-  };
-}
-
 /**
  * This provides some "basic" protection from malicious URLs. However, all of those
  * can be easily circumvented by pointing dns of origin to localhost, or with
@@ -556,14 +525,14 @@
       screenshotAssetId,
       oldScreenshotAssetId,
       bookmarkId,
-      AssetTypes.LINK_SCREENSHOT,
+      DBAssetTypes.LINK_SCREENSHOT,
       txn,
     );
     await updateAsset(
       imageAssetId,
       oldImageAssetId,
       bookmarkId,
-      AssetTypes.LINK_BANNER_IMAGE,
+      DBAssetTypes.LINK_BANNER_IMAGE,
       txn,
     );
   });
@@ -588,7 +557,7 @@
       fullPageArchiveAssetId,
       oldFullPageArchiveAssetId,
       bookmarkId,
-      AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+      DBAssetTypes.LINK_FULL_PAGE_ARCHIVE,
       txn,
     );
   });
@@ -661,6 +630,8 @@ async function runCrawler(job: Job<ZCrawlLinkRequest>) {
 
   // Update the search index
   triggerSearchReindex(bookmarkId);
+  // Trigger a potential download of a video from the URL
+  triggerVideoWorker(bookmarkId, url);
 
   // Do the archival as a separate last step as it has the potential for failure
   await archivalLogic();
diff --git a/apps/workers/index.ts b/apps/workers/index.ts
index 687d9ced..aca953cd 100644
--- a/apps/workers/index.ts
+++ b/apps/workers/index.ts
@@ -7,17 +7,19 @@ import { CrawlerWorker } from "./crawlerWorker";
 import { shutdownPromise } from "./exit";
 import { OpenAiWorker } from "./openaiWorker";
 import { SearchIndexingWorker } from "./searchWorker";
+import { VideoWorker } from "./videoWorker";
 
 async function main() {
   logger.info(`Workers version: ${serverConfig.serverVersion ?? "not set"}`);
 
-  const [crawler, openai, search] = [
+  const [crawler, openai, search, video] = [
     await CrawlerWorker.build(),
     OpenAiWorker.build(),
     SearchIndexingWorker.build(),
+    await VideoWorker.build(),
   ];
 
   await Promise.any([
-    Promise.all([crawler.run(), openai.run(), search.run()]),
+    Promise.all([crawler.run(), openai.run(), search.run(), video?.run()]),
     shutdownPromise,
   ]);
 }
diff --git a/apps/workers/package.json b/apps/workers/package.json
index b74f9ec9..c789ac4a 100644
--- a/apps/workers/package.json
+++ b/apps/workers/package.json
@@ -36,6 +36,7 @@
     "puppeteer-extra-plugin-stealth": "^2.11.2",
     "tsx": "^4.7.1",
     "typescript": "^5.3.3",
+    "yt-dlp-wrap": "^2.3.12",
     "zod": "^3.22.4"
   },
   "devDependencies": {
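Note: the crawler itself only enqueues; the actual download happens in the new videoWorker below. triggerVideoWorker (added in packages/shared/queues.ts further down) wraps a queue add, e.g. (the ids are placeholders):

    triggerVideoWorker("bookmarkId123", "https://example.com/video");
    // adds { bookmarkId: "bookmarkId123", url: "https://example.com/video" }
    // to the "video_queue" BullMQ queue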
diff --git a/apps/workers/videoWorker.ts b/apps/workers/videoWorker.ts
new file mode 100644
index 00000000..49d70f5d
--- /dev/null
+++ b/apps/workers/videoWorker.ts
@@ -0,0 +1,226 @@
+import fs from "fs";
+import path from "path";
+import type { Job } from "bullmq";
+import { Worker } from "bullmq";
+import YTDlpWrap from "yt-dlp-wrap";
+
+import { db } from "@hoarder/db";
+import { newAssetId, saveAssetFromFile } from "@hoarder/shared/assetdb";
+import serverConfig from "@hoarder/shared/config";
+import logger from "@hoarder/shared/logger";
+import {
+  queueConnectionDetails,
+  VideoWorkerQueue,
+  ZVideoRequest,
+} from "@hoarder/shared/queues";
+import { DBAssetTypes } from "@hoarder/shared/utils/bookmarkUtils";
+
+import { withTimeout } from "./utils";
+import { getBookmarkDetails, updateAsset } from "./workerUtils";
+
+const YT_DLP_BINARY = path.join(
+  serverConfig.dataDir,
+  process.platform === "win32" ? "yt-dlp.exe" : "yt-dlp",
+);
+const TMP_FOLDER = path.join(serverConfig.dataDir, "tmp");
+
+export class VideoWorker {
+  static async build() {
+    logger.info("Starting video worker ...");
+
+    const ytDlpAvailable = await prepareYTDLP();
+    if (!ytDlpAvailable) {
+      logger.error(
+        `[VideoCrawler] Unable to download yt-dlp. Video download will not be available!`,
+      );
+      return;
+    }
+
+    const worker = new Worker<ZVideoRequest, void>(
+      VideoWorkerQueue.name,
+      withTimeout(
+        runCrawler,
+        /* timeoutSec */ serverConfig.crawler.downloadVideoTimeout,
+      ),
+      {
+        concurrency: 1,
+        connection: queueConnectionDetails,
+        autorun: false,
+      },
+    );
+
+    worker.on("completed", (job) => {
+      const jobId = job?.id ?? "unknown";
+      logger.info(
+        `[VideoCrawler][${jobId}] Video download completed successfully`,
+      );
+    });
+
+    worker.on("failed", (job, error) => {
+      const jobId = job?.id ?? "unknown";
+      logger.error(
+        `[VideoCrawler][${jobId}] Video download job failed: ${error}`,
+      );
+    });
+
+    return worker;
+  }
+}
+
+async function getYTDLPVersion() {
+  try {
+    const ytDlpWrap = new YTDlpWrap(YT_DLP_BINARY);
+    const version = await ytDlpWrap.getVersion();
+    logger.info(`[VideoCrawler] yt-dlp version available: ${version}`);
+    return version;
+  } catch (e) {
+    logger.error(
+      `[VideoCrawler] Failed to determine yt-dlp version. It probably does not exist: ${e}`,
+    );
+  }
+}
+
+async function prepareYTDLP(): Promise<boolean> {
+  const version = await getYTDLPVersion();
+  if (version) {
+    return true;
+  }
+
+  logger.info(
+    `[VideoCrawler] Trying to download the latest version of yt-dlp to "${YT_DLP_BINARY}".`,
+  );
+  try {
+    await YTDlpWrap.downloadFromGithub(YT_DLP_BINARY);
+    await getYTDLPVersion();
+    return true;
+  } catch (e) {
+    logger.error(
+      `[VideoCrawler] Failed to download the latest version of yt-dlp`,
+    );
+    return false;
+  }
+}
+
+function prepareYtDlpArguments(url: string, assetPath: string) {
+  // TODO allow custom commandline arguments?
+  const ytDlpArguments = [url];
+  if (serverConfig.crawler.maxVideoDownloadSize > 0) {
+    ytDlpArguments.push(
+      "-f",
+      `best[filesize<${serverConfig.crawler.maxVideoDownloadSize}M]`,
+    );
+  }
+  ytDlpArguments.push("-o", assetPath);
+  return ytDlpArguments;
+}
+
+async function runCrawler(job: Job<ZVideoRequest>) {
+  const jobId = job.id ?? "unknown";
+  const { bookmarkId } = job.data;
+
+  const {
+    url,
+    userId,
+    videoAssetId: oldVideoAssetId,
+  } = await getBookmarkDetails(bookmarkId);
+
+  if (!serverConfig.crawler.downloadVideo) {
+    logger.info(
+      `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it is disabled in the config.`,
+    );
+    return;
+  }
+
+  const videoAssetId = newAssetId();
+  let assetPath = `${TMP_FOLDER}/${videoAssetId}`;
+  await fs.promises.mkdir(TMP_FOLDER, { recursive: true });
+
+  const ytDlpArguments = prepareYtDlpArguments(url, assetPath);
+
+  const ytDlpWrap = new YTDlpWrap(YT_DLP_BINARY);
+  try {
+    logger.info(
+      `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
+    );
+    await ytDlpWrap.execPromise(ytDlpArguments);
+    assetPath = await findAssetFile(jobId, videoAssetId);
+  } catch (e) {
+    logger.error(
+      `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}": ${e}`,
+    );
+    await deleteLeftOverAssetFile(jobId, videoAssetId);
+    return;
+  }
+
+  logger.info(
+    `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
+  );
+  await saveAssetFromFile({
+    userId,
+    assetId: videoAssetId,
+    assetPath,
+    metadata: { contentType: "video/mp4" },
+  });
+
+  await db.transaction(async (txn) => {
+    await updateAsset(
+      videoAssetId,
+      oldVideoAssetId,
+      bookmarkId,
+      DBAssetTypes.LINK_VIDEO,
+      txn,
+    );
+  });
+
+  logger.info(
+    `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
+  );
+}
+
"unknown"; + const { bookmarkId } = job.data; + + const { + url, + userId, + videoAssetId: oldVideoAssetId, + } = await getBookmarkDetails(bookmarkId); + + if (!serverConfig.crawler.downloadVideo) { + logger.info( + `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it is disabled in the config.`, + ); + return; + } + + const videoAssetId = newAssetId(); + let assetPath = `${TMP_FOLDER}/${videoAssetId}`; + await fs.promises.mkdir(TMP_FOLDER, { recursive: true }); + + const ytDlpArguments = prepareYtDlpArguments(url, assetPath); + + const ytDlpWrap = new YTDlpWrap(YT_DLP_BINARY); + try { + logger.info( + `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments:"${ytDlpArguments}"`, + ); + await ytDlpWrap.execPromise(ytDlpArguments); + assetPath = await findAssetFile(jobId, videoAssetId); + } catch (e) { + logger.error( + `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}": ${e}`, + ); + await deleteLeftOverAssetFile(jobId, videoAssetId); + return; + } + + logger.info( + `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`, + ); + await saveAssetFromFile({ + userId, + assetId: videoAssetId, + assetPath, + metadata: { contentType: "video/mp4" }, + }); + + await db.transaction(async (txn) => { + await updateAsset( + videoAssetId, + oldVideoAssetId, + bookmarkId, + DBAssetTypes.LINK_VIDEO, + txn, + ); + }); + + logger.info( + `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`, + ); +} + +/** + * Deletes leftover assets in case the download fails + * + * @param jobId the id of the job + * @param assetId the id of the asset to delete + */ +async function deleteLeftOverAssetFile( + jobId: string, + assetId: string, +): Promise { + let assetFile; + try { + assetFile = await findAssetFile(jobId, assetId); + } catch { + // ignore exception, no asset file was found + return; + } + logger.info( + `[VideoCrawler][${jobId}] Deleting leftover video asset "${assetFile}".`, + ); + try { + await fs.promises.rm(assetFile); + } catch (e) { + logger.error( + `[VideoCrawler][${jobId}] Failed deleting leftover video asset "${assetFile}".`, + ); + } +} + +/** + * yt-dlp automatically adds a file ending to the passed in filename --> we have to search it again in the folder + * + * @param jobId id of the job + * @param assetId the id of the asset to search + * @returns the path to the downloaded asset + */ +async function findAssetFile(jobId: string, assetId: string): Promise { + const files = await fs.promises.readdir(TMP_FOLDER); + for (const file of files) { + if (file.startsWith(assetId)) { + return path.join(TMP_FOLDER, file); + } + } + throw Error( + `[VideoCrawler][${jobId}] Unable to find file with assetId ${assetId}`, + ); +} diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts new file mode 100644 index 00000000..71934a3f --- /dev/null +++ b/apps/workers/workerUtils.ts @@ -0,0 +1,55 @@ +import { eq } from "drizzle-orm"; + +import { HoarderDBTransaction } from "@hoarder/db"; +import { db } from "@hoarder/db/drizzle"; +import { assets, bookmarks } from "@hoarder/db/schema"; +import { + DBAssetTypes, + mapAssetsToBookmarkFields, +} from "@hoarder/shared/utils/bookmarkUtils"; + +/** + * Removes the old asset and adds a new one instead + * @param newAssetId the new assetId to add + * @param oldAssetId the old assetId to remove (if it exists) + * @param bookmarkId the id of the bookmark the asset 
diff --git a/apps/workers/workerUtils.ts b/apps/workers/workerUtils.ts
new file mode 100644
index 00000000..71934a3f
--- /dev/null
+++ b/apps/workers/workerUtils.ts
@@ -0,0 +1,55 @@
+import { eq } from "drizzle-orm";
+
+import { HoarderDBTransaction } from "@hoarder/db";
+import { db } from "@hoarder/db/drizzle";
+import { assets, bookmarks } from "@hoarder/db/schema";
+import {
+  DBAssetTypes,
+  mapAssetsToBookmarkFields,
+} from "@hoarder/shared/utils/bookmarkUtils";
+
+/**
+ * Removes the old asset and adds a new one instead
+ *
+ * @param newAssetId the new assetId to add
+ * @param oldAssetId the old assetId to remove (if it exists)
+ * @param bookmarkId the id of the bookmark the asset belongs to
+ * @param assetType the type of the asset
+ * @param txn the transaction this update should happen in
+ */
+export async function updateAsset(
+  newAssetId: string | null,
+  oldAssetId: string | undefined,
+  bookmarkId: string,
+  assetType: DBAssetTypes,
+  txn: HoarderDBTransaction,
+) {
+  if (newAssetId) {
+    if (oldAssetId) {
+      await txn.delete(assets).where(eq(assets.id, oldAssetId));
+    }
+    await txn.insert(assets).values({
+      id: newAssetId,
+      assetType,
+      bookmarkId,
+    });
+  }
+}
+
+export async function getBookmarkDetails(bookmarkId: string) {
+  const bookmark = await db.query.bookmarks.findFirst({
+    where: eq(bookmarks.id, bookmarkId),
+    with: {
+      link: true,
+      assets: true,
+    },
+  });
+
+  if (!bookmark || !bookmark.link) {
+    throw new Error("The bookmark either doesn't exist or is not a link");
+  }
+  return {
+    url: bookmark.link.url,
+    userId: bookmark.userId,
+    ...mapAssetsToBookmarkFields(bookmark.assets),
+  };
+}
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 277d182e..7574b72e 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -38,15 +38,18 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging
 
 ## Crawler Configs
 
-| Name                          | Required | Default | Description |
-| ----------------------------- | -------- | ------- | ----------- |
-| CRAWLER_NUM_WORKERS           | No       | 1       | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
-| BROWSER_WEB_URL               | No       | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
-| BROWSER_WEBSOCKET_URL         | No       | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
-| BROWSER_CONNECT_ONDEMAND      | No       | false   | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
-| CRAWLER_DOWNLOAD_BANNER_IMAGE | No       | true    | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
-| CRAWLER_STORE_SCREENSHOT      | No       | true    | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
-| CRAWLER_FULL_PAGE_SCREENSHOT  | No       | false   | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
-| CRAWLER_FULL_PAGE_ARCHIVE     | No       | false   | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
-| CRAWLER_JOB_TIMEOUT_SEC       | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC  | No       | 30      | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name                               | Required | Default | Description |
+|------------------------------------|----------|---------|-------------|
+| CRAWLER_NUM_WORKERS                | No       | 1       | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| BROWSER_WEB_URL                    | No       | Not set | The browser's http debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_WEBSOCKET_URL              | No       | Not set | The websocket address of browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` are set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_CONNECT_ONDEMAND           | No       | false   | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only whenever a crawling is requested. Set to true if you're using a service that provides you with browser instances on demand. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE      | No       | true    | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT           | No       | true    | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_FULL_PAGE_SCREENSHOT       | No       | false   | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
+| CRAWLER_FULL_PAGE_ARCHIVE          | No       | false   | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
+| CRAWLER_JOB_TIMEOUT_SEC            | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC       | No       | 30      | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| CRAWLER_VIDEO_DOWNLOAD             | No       | false   | Whether to download videos from the page or not (using yt-dlp) |
+| CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE    | No       | 50      | The maximum file size (in MB) for the downloaded video. The quality will be chosen accordingly. Use -1 to disable the limit. |
+| CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC | No       | 600     | How long to wait for the video download to finish |
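Note: a deployment that enables the feature would set the new variables along these lines (values are illustrative):

    CRAWLER_VIDEO_DOWNLOAD=true
    CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE=100
    CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC=600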
diff --git a/packages/db/schema.ts b/packages/db/schema.ts
index c3e8e136..43b60a7c 100644
--- a/packages/db/schema.ts
+++ b/packages/db/schema.ts
@@ -12,6 +12,7 @@ import {
 } from "drizzle-orm/sqlite-core";
 
 import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
+import { DBAssetTypes } from "@hoarder/shared/utils/bookmarkUtils";
 
 function createdAtField() {
   return integer("createdAt", { mode: "timestamp" })
@@ -160,12 +161,6 @@ export const bookmarkLinks = sqliteTable(
   },
 );
 
-export const enum AssetTypes {
-  LINK_BANNER_IMAGE = "linkBannerImage",
-  LINK_SCREENSHOT = "linkScreenshot",
-  LINK_FULL_PAGE_ARCHIVE = "linkFullPageArchive",
-}
-
 export const assets = sqliteTable(
   "assets",
   {
@@ -173,9 +168,10 @@ export const assets = sqliteTable(
     id: text("id").notNull().primaryKey(),
     assetType: text("assetType", {
      enum: [
-        AssetTypes.LINK_BANNER_IMAGE,
-        AssetTypes.LINK_SCREENSHOT,
-        AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+        DBAssetTypes.LINK_BANNER_IMAGE,
+        DBAssetTypes.LINK_SCREENSHOT,
+        DBAssetTypes.LINK_FULL_PAGE_ARCHIVE,
+        DBAssetTypes.LINK_VIDEO,
       ],
     }).notNull(),
     bookmarkId: text("bookmarkId")
diff --git a/packages/shared/assetdb.ts b/packages/shared/assetdb.ts
index 64c1ca8c..480755cd 100644
--- a/packages/shared/assetdb.ts
+++ b/packages/shared/assetdb.ts
@@ -12,6 +12,7 @@ export const enum ASSET_TYPES {
   IMAGE_WEBP = "image/webp",
   APPLICATION_PDF = "application/pdf",
   TEXT_HTML = "text/html",
+  VIDEO_MP4 = "video/mp4",
 }
 
 export const IMAGE_ASSET_TYPES: Set<ASSET_TYPES> = new Set([
@@ -24,6 +25,7 @@ export const IMAGE_ASSET_TYPES: Set<ASSET_TYPES> = new Set([
 export const SUPPORTED_UPLOAD_ASSET_TYPES: Set<ASSET_TYPES> = new Set([
   ...IMAGE_ASSET_TYPES,
   ASSET_TYPES.APPLICATION_PDF,
+  ASSET_TYPES.VIDEO_MP4,
 ]);
 
 // The assets that we support saving in the asset db
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 2c739a0c..77ca50f3 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -30,6 +30,9 @@ const allEnv = z.object({
   CRAWLER_STORE_SCREENSHOT: stringBool("true"),
   CRAWLER_FULL_PAGE_SCREENSHOT: stringBool("false"),
   CRAWLER_FULL_PAGE_ARCHIVE: stringBool("false"),
+  CRAWLER_VIDEO_DOWNLOAD: stringBool("false"),
+  CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE: z.coerce.number().default(50),
+  CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC: z.coerce.number().default(10 * 60),
   MEILI_ADDR: z.string().optional(),
   MEILI_MASTER_KEY: z.string().default(""),
   LOG_LEVEL: z.string().default("debug"),
@@ -76,6 +79,9 @@ const serverConfigSchema = allEnv.transform((val) => {
       storeScreenshot: val.CRAWLER_STORE_SCREENSHOT,
       fullPageScreenshot: val.CRAWLER_FULL_PAGE_SCREENSHOT,
       fullPageArchive: val.CRAWLER_FULL_PAGE_ARCHIVE,
+      downloadVideo: val.CRAWLER_VIDEO_DOWNLOAD,
+      maxVideoDownloadSize: val.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE,
+      downloadVideoTimeout: val.CRAWLER_VIDEO_DOWNLOAD_TIMEOUT_SEC,
     },
     meilisearch: val.MEILI_ADDR
       ? {
diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts
index 2b890755..715b7694 100644
--- a/packages/shared/queues.ts
+++ b/packages/shared/queues.ts
@@ -83,3 +83,27 @@ export function triggerSearchDeletion(bookmarkId: string) {
     type: "delete",
   });
 }
+
+export const zvideoRequestSchema = z.object({
+  bookmarkId: z.string(),
+  url: z.string(),
+});
+export type ZVideoRequest = z.infer<typeof zvideoRequestSchema>;
+
+export const VideoWorkerQueue = new Queue<ZVideoRequest, void>("video_queue", {
+  connection: queueConnectionDetails,
+  defaultJobOptions: {
+    attempts: 3,
+    backoff: {
+      type: "exponential",
+      delay: 500,
+    },
+  },
+});
+
+export function triggerVideoWorker(bookmarkId: string, url: string) {
+  VideoWorkerQueue.add("video_queue", {
+    bookmarkId,
+    url,
+  });
+}
diff --git a/packages/shared/types/bookmarks.ts b/packages/shared/types/bookmarks.ts
index 26cd9ceb..0a95074f 100644
--- a/packages/shared/types/bookmarks.ts
+++ b/packages/shared/types/bookmarks.ts
@@ -19,6 +19,7 @@ export const zBookmarkedLinkSchema = z.object({
   imageAssetId: z.string().nullish(),
   screenshotAssetId: z.string().nullish(),
   fullPageArchiveAssetId: z.string().nullish(),
+  videoAssetId: z.string().nullish(),
   favicon: z.string().url().nullish(),
   htmlContent: z.string().nullish(),
   crawledAt: z.date().nullish(),
diff --git a/packages/shared/utils/bookmarkUtils.ts b/packages/shared/utils/bookmarkUtils.ts
new file mode 100644
index 00000000..3ef56ada
--- /dev/null
+++ b/packages/shared/utils/bookmarkUtils.ts
@@ -0,0 +1,36 @@
+export const enum DBAssetTypes {
+  LINK_BANNER_IMAGE = "linkBannerImage",
+  LINK_SCREENSHOT = "linkScreenshot",
+  LINK_FULL_PAGE_ARCHIVE = "linkFullPageArchive",
+  LINK_VIDEO = "linkVideo",
+}
+
+interface Asset {
+  id: string;
+  assetType: DBAssetTypes;
+}
+
+export const enum BookmarkAssetType {
+  SCREENSHOT_ASSET_ID = "screenshotAssetId",
+  FULL_PAGE_ARCHIVE_ASSET_ID = "fullPageArchiveAssetId",
+  IMAGE_ASSET_ID = "imageAssetId",
+  VIDEO_ASSET_ID = "videoAssetId",
+}
+
+export const ASSET_TYE_MAPPING: Record<DBAssetTypes, BookmarkAssetType> = {
+  [DBAssetTypes.LINK_SCREENSHOT]: BookmarkAssetType.SCREENSHOT_ASSET_ID,
+  [DBAssetTypes.LINK_FULL_PAGE_ARCHIVE]:
+    BookmarkAssetType.FULL_PAGE_ARCHIVE_ASSET_ID,
+  [DBAssetTypes.LINK_BANNER_IMAGE]: BookmarkAssetType.IMAGE_ASSET_ID,
+  [DBAssetTypes.LINK_VIDEO]: BookmarkAssetType.VIDEO_ASSET_ID,
+};
+
+export function mapAssetsToBookmarkFields(
+  assets: Asset | Asset[] = [],
+): Record<string, string> {
+  const assetsArray = Array.isArray(assets) ? assets : [assets];
+  return assetsArray.reduce((result: Record<string, string>, asset: Asset) => {
+    result[ASSET_TYE_MAPPING[asset.assetType]] = asset.id;
+    return result;
+  }, {});
+}
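Note: mapAssetsToBookmarkFields flattens a bookmark's asset rows into the *AssetId fields of the bookmark schema; a sketch with made-up ids:

    mapAssetsToBookmarkFields([
      { id: "a1", assetType: DBAssetTypes.LINK_SCREENSHOT },
      { id: "a2", assetType: DBAssetTypes.LINK_VIDEO },
    ]);
    // => { screenshotAssetId: "a1", videoAssetId: "a2" }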
diff --git a/packages/trpc/routers/bookmarks.ts b/packages/trpc/routers/bookmarks.ts
index 0283f508..56efb6e4 100644
--- a/packages/trpc/routers/bookmarks.ts
+++ b/packages/trpc/routers/bookmarks.ts
@@ -11,7 +11,6 @@ import type { ZBookmarkTags } from "@hoarder/shared/types/tags";
 import { db as DONT_USE_db } from "@hoarder/db";
 import {
   assets,
-  AssetTypes,
   bookmarkAssets,
   bookmarkLinks,
   bookmarks,
@@ -38,6 +37,7 @@ import {
   zNewBookmarkRequestSchema,
   zUpdateBookmarksRequestSchema,
 } from "@hoarder/shared/types/bookmarks";
+import { mapAssetsToBookmarkFields } from "@hoarder/shared/utils/bookmarkUtils";
 
 import type { AuthedContext, Context } from "../index";
 import { authedProcedure, router } from "../index";
@@ -74,25 +74,6 @@ export const ensureBookmarkOwnership = experimental_trpcMiddleware<{
   return opts.next();
 });
 
-interface Asset {
-  id: string;
-  assetType: AssetTypes;
-}
-
-const ASSET_TYE_MAPPING: Record<AssetTypes, string> = {
-  [AssetTypes.LINK_SCREENSHOT]: "screenshotAssetId",
-  [AssetTypes.LINK_FULL_PAGE_ARCHIVE]: "fullPageArchiveAssetId",
-  [AssetTypes.LINK_BANNER_IMAGE]: "imageAssetId",
-};
-
-function mapAssetsToBookmarkFields(assets: Asset | Asset[] = []) {
-  const assetsArray = Array.isArray(assets) ? assets : [assets];
-  return assetsArray.reduce((result: Record<string, string>, asset: Asset) => {
-    result[ASSET_TYE_MAPPING[asset.assetType]] = asset.id;
-    return result;
-  }, {});
-}
-
 async function getBookmark(ctx: AuthedContext, bookmarkId: string) {
   const bookmark = await ctx.db.query.bookmarks.findFirst({
     where: and(eq(bookmarks.userId, ctx.user.id), eq(bookmarks.id, bookmarkId)),
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index eac8aee8..48947d82 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -734,6 +734,9 @@ importers:
       typescript:
         specifier: ^5.3.3
        version: 5.3.3
+      yt-dlp-wrap:
+        specifier: ^2.3.12
+        version: 2.3.12
       zod:
         specifier: ^3.22.4
         version: 3.22.4
@@ -7136,16 +7139,20 @@
 
   glob@6.0.4:
     resolution: {integrity: sha512-MKZeRNyYZAVVVG1oZeLaWie1uweH40m9AZwIwxyPbTSX4hHrVYSzLg0Ro5Z5R7XKkIX+Cc6oD1rqeDJnwsB8/A==}
+    deprecated: Glob versions prior to v9 are no longer supported
 
   glob@7.1.6:
     resolution: {integrity: sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA==}
+    deprecated: Glob versions prior to v9 are no longer supported
 
   glob@7.2.3:
     resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==}
+    deprecated: Glob versions prior to v9 are no longer supported
 
   glob@8.1.0:
     resolution: {integrity: sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==}
     engines: {node: '>=12'}
+    deprecated: Glob versions prior to v9 are no longer supported
 
   global-dirs@3.0.1:
     resolution: {integrity: sha512-NBcGGFbBA9s1VzD41QXDG+3++t9Mn5t1FpLdhESY6oKY4gYTFpX4wO3sqGUa0Srjtbfj3szX0RnemmrVRUdULA==}
@@ -10708,18 +10715,22 @@
 
   rimraf@2.4.5:
     resolution: {integrity: sha512-J5xnxTyqaiw06JjMftq7L9ouA448dw/E7dKghkP9WpKNuwmARNNg+Gk8/u5ryb9N/Yo2+z3MCwuqFK/+qPOPfQ==}
+    deprecated: Rimraf versions prior to v4 are no longer supported
     hasBin: true
 
   rimraf@2.6.3:
     resolution: {integrity: sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==}
+    deprecated: Rimraf versions prior to v4 are no longer supported
     hasBin: true
 
   rimraf@2.7.1:
     resolution: {integrity: sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w==}
+    deprecated: Rimraf versions prior to v4 are no longer supported
     hasBin: true
 
   rimraf@3.0.2:
     resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==}
+    deprecated: Rimraf versions prior to v4 are no longer supported
     hasBin: true
 
   rollup-plugin-terser@7.0.2:
@@ -12434,6 +12445,9 @@
     resolution: {integrity: sha512-Ct97huExsu7cWeEjmrXlofevF8CvzUglJ4iGUet5B8xn1oumtAZBpHU4GzYuoE6PVqcZ5hghtBrSlhwHuR1Jmw==}
     engines: {node: '>=18'}
 
+  yt-dlp-wrap@2.3.12:
+    resolution: {integrity: sha512-P8fJ+6M1YjukyJENCTviNLiZ8mokxprR54ho3DsSKPWDcac489OjRiStGEARJr6un6ETS6goTn4CWl/b/rM3aA==}
+
   zod@3.22.4:
     resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
@@ -29560,6 +29574,9 @@ snapshots:
 
   yoctocolors@2.0.2:
     dev: false
 
+  yt-dlp-wrap@2.3.12:
+    dev: false
+
   zod@3.22.4: {}
 
   zustand@4.5.1(@types/react@18.2.58)(react@18.2.0):