Commit

feature: Archive videos using yt-dlp. Fixes hoarder-app#215 (hoarder-app#525)

* Allow downloading more content from a webpage and index it (hoarder-app#215)
Added a worker that downloads videos, gated by environment variables
Refactored the code a bit
Added a new video asset type
Updated documentation

* Some tweaks

* Drop the dependency on the yt-dlp wrapper

* Update openapi specs

* Don't log an error when the URL is not supported

* Better handle supported websites that don't download anything

---------

Co-authored-by: Mohamed Bassem <[email protected]>
kamtschatka and MohamedBassem committed Nov 2, 2024
1 parent 1f0a703 commit 80fcf5a
Showing 17 changed files with 403 additions and 71 deletions.
32 changes: 26 additions & 6 deletions apps/web/app/api/assets/[assetId]/route.ts
@@ -27,10 +27,30 @@ export async function GET(
assetId: params.assetId,
});

return new Response(asset, {
status: 200,
headers: {
"Content-type": metadata.contentType,
},
});
const range = request.headers.get("Range");
if (range) {
const parts = range.replace(/bytes=/, "").split("-");
const start = parseInt(parts[0], 10);
const end = parts[1] ? parseInt(parts[1], 10) : asset.length - 1;

// TODO: Don't read the whole asset into memory in the first place
const chunk = asset.subarray(start, end + 1);
return new Response(chunk, {
status: 206, // Partial Content
headers: {
"Content-Range": `bytes ${start}-${end}/${asset.length}`,
"Accept-Ranges": "bytes",
"Content-Length": chunk.length.toString(),
"Content-type": metadata.contentType,
},
});
} else {
return new Response(asset, {
status: 200,
headers: {
"Content-Length": asset.length.toString(),
"Content-type": metadata.contentType,
},
});
}
}
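The new Range branch is what lets a browser's video player seek within an archived file: the player sends a "Range: bytes=start-end" header and expects a 206 Partial Content response with a matching Content-Range. A minimal client-side sketch of exercising the endpoint, assuming the URL shape of the route above (the asset id is a placeholder):

// Sketch: request the first 1 MiB of an asset and inspect the partial response.
async function fetchFirstChunk(assetId: string): Promise<ArrayBuffer | null> {
  const res = await fetch(`/api/assets/${assetId}`, {
    headers: { Range: "bytes=0-1048575" },
  });
  if (res.status !== 206) {
    return null; // the server did not return partial content
  }
  console.log(res.headers.get("Content-Range")); // e.g. "bytes 0-1048575/52428800"
  return res.arrayBuffer();
}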
2 changes: 2 additions & 0 deletions apps/web/components/dashboard/preview/AttachmentBox.tsx
@@ -20,6 +20,7 @@ import {
Pencil,
Plus,
Trash2,
Video,
} from "lucide-react";

import {
@@ -44,6 +45,7 @@ export default function AttachmentBox({ bookmark }: { bookmark: ZBookmark }) {
screenshot: <Camera className="size-4" />,
fullPageArchive: <Archive className="size-4" />,
bannerImage: <Image className="size-4" />,
video: <Video className="size-4" />,
bookmarkAsset: <Paperclip className="size-4" />,
unknown: <Paperclip className="size-4" />,
};
19 changes: 19 additions & 0 deletions apps/web/components/dashboard/preview/LinkContentSection.tsx
@@ -60,6 +60,20 @@ function CachedContentSection({ link }: { link: ZBookmarkedLink }) {
return <ScrollArea className="h-full">{content}</ScrollArea>;
}

function VideoSection({ link }: { link: ZBookmarkedLink }) {
return (
<div className="relative h-full w-full overflow-hidden">
<div className="absolute inset-0 h-full w-full">
{/* eslint-disable-next-line jsx-a11y/media-has-caption -- captions not (yet) available */}
<video className="m-auto max-h-full max-w-full" controls>
<source src={`/api/assets/${link.videoAssetId}`} />
Not supported by your browser
</video>
</div>
</div>
);
}

export default function LinkContentSection({
bookmark,
}: {
@@ -76,6 +90,8 @@ export default function LinkContentSection({
content = <CachedContentSection link={bookmark.content} />;
} else if (section === "archive") {
content = <FullPageArchiveSection link={bookmark.content} />;
} else if (section === "video") {
content = <VideoSection link={bookmark.content} />;
} else {
content = <ScreenshotSection link={bookmark.content} />;
}
@@ -101,6 +117,9 @@
>
Archive
</SelectItem>
<SelectItem value="video" disabled={!bookmark.content.videoAssetId}>
Video
</SelectItem>
</SelectGroup>
</SelectContent>
</Select>
59 changes: 10 additions & 49 deletions apps/workers/crawlerWorker.ts
@@ -23,9 +23,10 @@ import puppeteer from "puppeteer-extra";
import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";

import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db, HoarderDBTransaction } from "@hoarder/db";
import { db } from "@hoarder/db";
import {
assets,
AssetTypes,
@@ -35,12 +36,12 @@
} from "@hoarder/db/schema";
import {
ASSET_TYPES,
deleteAsset,
getAssetSize,
IMAGE_ASSET_TYPES,
newAssetId,
saveAsset,
saveAssetFromFile,
silentDeleteAsset,
SUPPORTED_UPLOAD_ASSET_TYPES,
} from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
@@ -49,6 +50,7 @@ import {
LinkCrawlerQueue,
OpenAIQueue,
triggerSearchReindex,
triggerVideoWorker,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
@@ -207,33 +209,6 @@ async function changeBookmarkStatus(
.where(eq(bookmarkLinks.id, bookmarkId));
}

async function getBookmarkDetails(bookmarkId: string) {
const bookmark = await db.query.bookmarks.findFirst({
where: eq(bookmarks.id, bookmarkId),
with: {
link: true,
assets: true,
},
});

if (!bookmark || !bookmark.link) {
throw new Error("The bookmark either doesn't exist or is not a link");
}
return {
url: bookmark.link.url,
userId: bookmark.userId,
screenshotAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_SCREENSHOT,
)?.id,
imageAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_BANNER_IMAGE,
)?.id,
fullPageArchiveAssetId: bookmark.assets.find(
(a) => a.assetType == AssetTypes.LINK_FULL_PAGE_ARCHIVE,
)?.id,
};
}

/**
* This provides some "basic" protection from malicious URLs. However, all of those
* can be easily circumvented by pointing dns of origin to localhost, or with
Expand Down Expand Up @@ -609,12 +584,8 @@ async function crawlAndParseUrl(

// Delete the old assets if any
await Promise.all([
oldScreenshotAssetId
? deleteAsset({ userId, assetId: oldScreenshotAssetId }).catch(() => ({}))
: {},
oldImageAssetId
? deleteAsset({ userId, assetId: oldImageAssetId }).catch(() => ({}))
: {},
silentDeleteAsset(userId, oldScreenshotAssetId),
silentDeleteAsset(userId, oldImageAssetId),
]);

return async () => {
@@ -641,9 +612,7 @@
);
});
if (oldFullPageArchiveAssetId) {
await deleteAsset({ userId, assetId: oldFullPageArchiveAssetId }).catch(
() => ({}),
);
silentDeleteAsset(userId, oldFullPageArchiveAssetId);
}
}
};
@@ -713,17 +682,9 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
// Update the search index
await triggerSearchReindex(bookmarkId);

// Trigger a potential download of a video from the URL
await triggerVideoWorker(bookmarkId, url);

// Do the archival as a separate last step as it has the potential for failure
await archivalLogic();
}

async function updateAsset(
oldAssetId: string | undefined,
newAsset: DBAssetType,
txn: HoarderDBTransaction,
) {
if (oldAssetId) {
await txn.delete(assets).where(eq(assets.id, oldAssetId));
}
await txn.insert(assets).values(newAsset);
}
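The inline deleteAsset(...).catch(() => ({})) calls are folded into a silentDeleteAsset helper imported from @hoarder/shared/assetdb. Its implementation is not part of the hunks shown on this page; a minimal sketch of what such a helper could look like, inferred from the call sites above:

import { deleteAsset } from "@hoarder/shared/assetdb";

// Sketch only: best-effort deletion that tolerates a missing id and swallows
// failures. The real helper's signature and behavior may differ.
export async function silentDeleteAsset(
  userId: string,
  assetId: string | undefined,
): Promise<void> {
  if (!assetId) {
    return;
  }
  await deleteAsset({ userId, assetId }).catch(() => ({}));
}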
15 changes: 12 additions & 3 deletions apps/workers/index.ts
@@ -10,30 +10,39 @@ import { CrawlerWorker } from "./crawlerWorker";
import { shutdownPromise } from "./exit";
import { OpenAiWorker } from "./openaiWorker";
import { SearchIndexingWorker } from "./searchWorker";
import { VideoWorker } from "./videoWorker";

async function main() {
logger.info(`Workers version: ${serverConfig.serverVersion ?? "not set"}`);
runQueueDBMigrations();

const [crawler, openai, search, tidyAssets] = [
const [crawler, openai, search, tidyAssets, video] = [
await CrawlerWorker.build(),
OpenAiWorker.build(),
SearchIndexingWorker.build(),
TidyAssetsWorker.build(),
VideoWorker.build(),
];

await Promise.any([
Promise.all([crawler.run(), openai.run(), search.run(), tidyAssets.run()]),
Promise.all([
crawler.run(),
openai.run(),
search.run(),
tidyAssets.run(),
video.run(),
]),
shutdownPromise,
]);
logger.info(
"Shutting down crawler, openai, tidyAssets and search workers ...",
"Shutting down crawler, openai, tidyAssets, video and search workers ...",
);

crawler.stop();
openai.stop();
search.stop();
tidyAssets.stop();
video.stop();
}

main();
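The VideoWorker itself (apps/workers/videoWorker.ts) is among the 17 changed files but its hunk is not shown on this page. Given the commit title and the environment-variable gating mentioned in the message, here is a heavily simplified sketch of how such a worker might shell out to yt-dlp; the function, the env variable name, and the flag values are illustrative assumptions, not the actual implementation:

import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Illustrative env variable name; the real config key may differ.
const MAX_VIDEO_SIZE_MB = Number(process.env.CRAWLER_VIDEO_DOWNLOAD_MAX_SIZE ?? 50);

// Sketch only: download the video behind a bookmarked URL into outputPath.
export async function downloadVideo(
  url: string,
  outputPath: string,
): Promise<string | null> {
  try {
    await execFileAsync("yt-dlp", [
      "--no-playlist",
      "--max-filesize",
      `${MAX_VIDEO_SIZE_MB}M`,
      "-o",
      outputPath,
      url,
    ]);
  } catch {
    // yt-dlp exits non-zero for unsupported URLs; per the commit message this
    // is not logged as an error, the bookmark is simply skipped.
    return null;
  }
  return outputPath;
}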
