Skip to content

Commit

Permalink
Allow downloading more content from a webpage and index it #215
Browse files Browse the repository at this point in the history
Added a worker that allows downloading videos depending on the environment variables
refactored the code a bit
added new video asset
updated documentation
  • Loading branch information
kamtschatka committed Jun 30, 2024
1 parent c1ad57a commit 484f422
Show file tree
Hide file tree
Showing 16 changed files with 450 additions and 91 deletions.
31 changes: 25 additions & 6 deletions apps/web/app/api/assets/[assetId]/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,29 @@ export async function GET(
assetId: params.assetId,
});

return new Response(asset, {
status: 200,
headers: {
"Content-type": metadata.contentType,
},
});
// Serve the asset, honouring a single-range `Range: bytes=...` request header
// so that media elements can seek without downloading the whole asset.
const range = request.headers.get("Range");
const size = asset.length;

// Accepted forms: "bytes=START-", "bytes=START-END" and the suffix form
// "bytes=-N" (last N bytes). Anything else (other units, multiple ranges,
// garbage) does not match and is ignored, i.e. the whole asset is sent.
const spec = range ? /^bytes=(\d*)-(\d*)$/.exec(range.trim()) : null;
const firstPos = spec?.[1] ?? "";
const lastPos = spec?.[2] ?? "";

// Inclusive byte positions to serve; null means "send the whole asset".
let byteRange: { start: number; end: number } | null = null;
if (firstPos !== "") {
  const start = parseInt(firstPos, 10);
  const requestedEnd = lastPos !== "" ? parseInt(lastPos, 10) : size - 1;
  // A last position smaller than the first one makes the header invalid,
  // in which case it is ignored instead of being answered with an error.
  if (lastPos === "" || requestedEnd >= start) {
    // Positions past the last byte are clamped to the end of the asset.
    byteRange = { start, end: Math.min(requestedEnd, size - 1) };
  }
} else if (lastPos !== "") {
  // Suffix form: a suffix longer than the asset selects the whole asset.
  const suffixLength = parseInt(lastPos, 10);
  byteRange = { start: Math.max(size - suffixLength, 0), end: size - 1 };
}

// After clamping, start > end means the range begins past the last byte
// (or the asset is empty, or a zero-length suffix was requested).
if (byteRange && byteRange.start > byteRange.end) {
  return new Response(null, {
    status: 416, // Range Not Satisfiable
    headers: {
      "Content-Range": `bytes */${size}`,
      "Accept-Ranges": "bytes",
    },
  });
}

if (byteRange) {
  const { start, end } = byteRange;
  // subarray's end index is exclusive, the range's end position is inclusive.
  const chunk = asset.subarray(start, end + 1);
  return new Response(chunk, {
    status: 206, // Partial Content
    headers: {
      "Content-Range": `bytes ${start}-${end}/${size}`,
      "Accept-Ranges": "bytes",
      "Content-Length": chunk.length.toString(),
      "Content-type": metadata.contentType,
    },
  });
} else {
  return new Response(asset, {
    status: 200,
    headers: {
      // Advertise range support so clients know they may seek.
      "Accept-Ranges": "bytes",
      "Content-Length": size.toString(),
      "Content-type": metadata.contentType,
    },
  });
}
}
19 changes: 19 additions & 0 deletions apps/web/components/dashboard/preview/LinkContentSection.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,20 @@ function CachedContentSection({ link }: { link: ZBookmarkedLink }) {
return <ScrollArea className="h-full">{content}</ScrollArea>;
}

/**
 * Preview section that renders a player for the video asset attached to a
 * bookmarked link. The media is loaded from the `/api/assets/<id>` endpoint
 * using the link's `videoAssetId`.
 */
function VideoSection({ link }: { link: ZBookmarkedLink }) {
  const videoUrl = `/api/assets/${link.videoAssetId}`;

  return (
    <div className="relative h-full w-full overflow-hidden">
      <div className="absolute inset-0 h-full w-full">
        {/* eslint-disable-next-line jsx-a11y/media-has-caption -- captions not (yet) available */}
        <video className="m-auto max-h-full max-w-full" controls>
          <source src={videoUrl} />
          Not supported by your browser
        </video>
      </div>
    </div>
  );
}

export default function LinkContentSection({
bookmark,
}: {
Expand All @@ -76,6 +90,8 @@ export default function LinkContentSection({
content = <CachedContentSection link={bookmark.content} />;
} else if (section === "archive") {
content = <FullPageArchiveSection link={bookmark.content} />;
} else if (section === "video") {
content = <VideoSection link={bookmark.content} />;
} else {
content = <ScreenshotSection link={bookmark.content} />;
}
Expand All @@ -101,6 +117,9 @@ export default function LinkContentSection({
>
Archive
</SelectItem>
<SelectItem value="video" disabled={!bookmark.content.videoAssetId}>
Video
</SelectItem>
</SelectGroup>
</SelectContent>
</Select>
Expand Down
55 changes: 13 additions & 42 deletions apps/workers/crawlerWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@ import type { Job } from "bullmq";
import type { Browser } from "puppeteer";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import Database from "better-sqlite3";
import { Worker } from "bullmq";
import DOMPurify from "dompurify";
import { eq, ExtractTablesWithRelations } from "drizzle-orm";
import { SQLiteTransaction } from "drizzle-orm/sqlite-core";
import { eq } from "drizzle-orm";
import { execa } from "execa";
import { isShuttingDown } from "exit";
import { JSDOM } from "jsdom";
Expand All @@ -27,15 +25,8 @@ import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { withTimeout } from "utils";

import type { ZCrawlLinkRequest } from "@hoarder/shared/queues";
import { db, HoarderDBTransaction } from "@hoarder/db";
import {
assets,
AssetTypes,
bookmarkAssets,
bookmarkLinks,
bookmarks,
} from "@hoarder/db/schema";
import { db } from "@hoarder/db";
import { bookmarkAssets, bookmarkLinks, bookmarks } from "@hoarder/db/schema";
import {
ASSET_TYPES,
IMAGE_ASSET_TYPES,
Expand All @@ -52,9 +43,14 @@ import {
OpenAIQueue,
queueConnectionDetails,
triggerSearchReindex,
triggerVideoWorker,
ZCrawlLinkRequest,
zCrawlLinkRequestSchema,
} from "@hoarder/shared/queues";
import { BookmarkTypes } from "@hoarder/shared/types/bookmarks";
import { DBAssetTypes } from "@hoarder/shared/utils/bookmarkUtils";

import { getBookmarkDetails, updateAsset } from "./workerUtils";

const metascraperParser = metascraper([
metascraperAmazon(),
Expand Down Expand Up @@ -202,33 +198,6 @@ async function changeBookmarkStatus(
.where(eq(bookmarkLinks.id, bookmarkId));
}

/**
 * Loads a bookmark together with its link and assets and extracts the data
 * needed to process it.
 *
 * @param bookmarkId - id of the bookmark to look up
 * @returns the link's url, the owning user's id, and the ids of the
 *   screenshot, banner image and full page archive assets (each `undefined`
 *   when the bookmark has no asset of that type)
 * @throws Error when no bookmark with that id is found or it has no link
 */
async function getBookmarkDetails(bookmarkId: string) {
  const bookmark = await db.query.bookmarks.findFirst({
    where: eq(bookmarks.id, bookmarkId),
    with: {
      link: true,
      assets: true,
    },
  });

  if (!bookmark || !bookmark.link) {
    throw new Error("The bookmark either doesn't exist or not a link");
  }

  const storedAssets = bookmark.assets;
  // Id of the first stored asset of the given type, if there is one.
  // Strict equality: both sides are asset type values, no coercion is wanted.
  const assetIdOfType = (
    wanted: (typeof storedAssets)[number]["assetType"],
  ) => storedAssets.find((a) => a.assetType === wanted)?.id;

  return {
    url: bookmark.link.url,
    userId: bookmark.userId,
    screenshotAssetId: assetIdOfType(AssetTypes.LINK_SCREENSHOT),
    imageAssetId: assetIdOfType(AssetTypes.LINK_BANNER_IMAGE),
    fullPageArchiveAssetId: assetIdOfType(AssetTypes.LINK_FULL_PAGE_ARCHIVE),
  };
}

/**
* This provides some "basic" protection from malicious URLs. However, all of those
* can be easily circumvented by pointing dns of origin to localhost, or with
Expand Down Expand Up @@ -556,14 +525,14 @@ async function crawlAndParseUrl(
screenshotAssetId,
oldScreenshotAssetId,
bookmarkId,
AssetTypes.LINK_SCREENSHOT,
DBAssetTypes.LINK_SCREENSHOT,
txn,
);
await updateAsset(
imageAssetId,
oldImageAssetId,
bookmarkId,
AssetTypes.LINK_BANNER_IMAGE,
DBAssetTypes.LINK_BANNER_IMAGE,
txn,
);
});
Expand All @@ -588,7 +557,7 @@ async function crawlAndParseUrl(
fullPageArchiveAssetId,
oldFullPageArchiveAssetId,
bookmarkId,
AssetTypes.LINK_FULL_PAGE_ARCHIVE,
DBAssetTypes.LINK_FULL_PAGE_ARCHIVE,
txn,
);
});
Expand Down Expand Up @@ -661,6 +630,8 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {

// Update the search index
triggerSearchReindex(bookmarkId);
// Trigger a potential download of a video from the URL
triggerVideoWorker(bookmarkId, url);

// Do the archival as a separate last step as it has the potential for failure
await archivalLogic();
Expand Down
6 changes: 4 additions & 2 deletions apps/workers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,19 @@ import { CrawlerWorker } from "./crawlerWorker";
import { shutdownPromise } from "./exit";
import { OpenAiWorker } from "./openaiWorker";
import { SearchIndexingWorker } from "./searchWorker";
import { VideoWorker } from "./videoWorker";

async function main() {
logger.info(`Workers version: ${serverConfig.serverVersion ?? "not set"}`);
const [crawler, openai, search] = [
const [crawler, openai, search, video] = [
await CrawlerWorker.build(),
OpenAiWorker.build(),
SearchIndexingWorker.build(),
await VideoWorker.build(),
];

await Promise.any([
Promise.all([crawler.run(), openai.run(), search.run()]),
Promise.all([crawler.run(), openai.run(), search.run(), video?.run()]),
shutdownPromise,
]);
}
Expand Down
1 change: 1 addition & 0 deletions apps/workers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"puppeteer-extra-plugin-stealth": "^2.11.2",
"tsx": "^4.7.1",
"typescript": "^5.3.3",
"yt-dlp-wrap": "^2.3.12",
"zod": "^3.22.4"
},
"devDependencies": {
Expand Down
Loading

0 comments on commit 484f422

Please sign in to comment.