feature: Store crawling status code and allow users to find broken links. Fixes #169
MohamedBassem committed Dec 8, 2024
1 parent a7b1386 commit 705d539
Showing 9 changed files with 1,628 additions and 6 deletions.
131 changes: 131 additions & 0 deletions apps/web/app/settings/broken-links/page.tsx
@@ -0,0 +1,131 @@
"use client";

import { ActionButton } from "@/components/ui/action-button";
import { FullPageSpinner } from "@/components/ui/full-page-spinner";
import {
Table,
TableBody,
TableCell,
TableHead,
TableHeader,
TableRow,
} from "@/components/ui/table";
import { toast } from "@/components/ui/use-toast";
import { RefreshCw, Trash2 } from "lucide-react";
import { useTranslation } from "react-i18next";

import {
useDeleteBookmark,
useRecrawlBookmark,
} from "@hoarder/shared-react/hooks/bookmarks";
import { api } from "@hoarder/shared-react/trpc";

export default function BrokenLinksPage() {
const { t } = useTranslation();

const apiUtils = api.useUtils();
const { data, isPending } = api.bookmarks.getBrokenLinks.useQuery();

const { mutate: deleteBookmark, isPending: isDeleting } = useDeleteBookmark({
onSuccess: () => {
toast({
description: t("toasts.bookmarks.deleted"),
});
apiUtils.bookmarks.getBrokenLinks.invalidate();
},
onError: () => {
toast({
description: t("common.something_went_wrong"),
variant: "destructive",
});
},
});

const { mutate: recrawlBookmark, isPending: isRecrawling } =
useRecrawlBookmark({
onSuccess: () => {
toast({
description: t("toasts.bookmarks.refetch"),
});
apiUtils.bookmarks.getBrokenLinks.invalidate();
},
onError: () => {
toast({
description: t("common.something_went_wrong"),
variant: "destructive",
});
},
});

return (
<div className="rounded-md border bg-background p-4">
<div className="flex items-center justify-between">
<div className="mb-2 text-lg font-medium">
{t("settings.broken_links.broken_links")}
</div>
</div>
<div className="mt-2">
{isPending && <FullPageSpinner />}
{!isPending && data && data.bookmarks.length == 0 && (
<p className="rounded-md bg-muted p-2 text-sm text-muted-foreground">
No broken links found
</p>
)}
{!isPending && data && data.bookmarks.length > 0 && (
<Table>
<TableHeader>
<TableRow>
<TableHead>{t("common.url")}</TableHead>
<TableHead>{t("common.created_at")}</TableHead>
<TableHead>
{t("settings.broken_links.last_crawled_at")}
</TableHead>
<TableHead>
{t("settings.broken_links.crawling_status")}
</TableHead>
<TableHead>{t("common.action")}</TableHead>
</TableRow>
</TableHeader>
<TableBody>
{data.bookmarks.map((b) => (
<TableRow key={b.id}>
<TableCell>{b.url}</TableCell>
<TableCell>{b.createdAt?.toLocaleString()}</TableCell>
<TableCell>{b.crawledAt?.toLocaleString()}</TableCell>
<TableCell>
{b.isCrawlingFailure ? (
<span className="text-red-500">Failed</span>
) : (
b.statusCode
)}
</TableCell>
<TableCell className="flex gap-2">
<ActionButton
variant="secondary"
loading={isRecrawling}
onClick={() => recrawlBookmark({ bookmarkId: b.id })}
className="flex items-center gap-2"
>
<RefreshCw className="size-4" />
{t("actions.recrawl")}
</ActionButton>
<ActionButton
variant="destructive"
onClick={() => deleteBookmark({ bookmarkId: b.id })}
loading={isDeleting}
className="flex items-center gap-2"
>
<Trash2 className="size-4" />
{t("actions.delete")}
</ActionButton>
</TableCell>
</TableRow>
))}
<TableRow></TableRow>
</TableBody>
</Table>
)}
</div>
</div>
);
}
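
The table above renders both b.statusCode and an isCrawlingFailure flag returned by bookmarks.getBrokenLinks. The server-side procedure lives in one of the changed files that is not shown on this page; the helper below is only a sketch of how such a flag could be derived from a stored status code, with the failure criterion assumed rather than taken from the repository.

```ts
// Hypothetical helper (not part of this commit): derive an
// isCrawlingFailure flag like the one consumed by the page above from a
// stored crawl status code. Assumption: a missing or zero status means the
// crawl never received an HTTP response, and 4xx/5xx responses are broken.
export function isCrawlingFailure(crawlStatusCode: number | null): boolean {
  if (crawlStatusCode === null || crawlStatusCode === 0) {
    return true;
  }
  return crawlStatusCode >= 400;
}

// isCrawlingFailure(404) === true; isCrawlingFailure(200) === false
```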
6 changes: 6 additions & 0 deletions apps/web/components/settings/sidebar/items.tsx
@@ -4,6 +4,7 @@ import {
ArrowLeft,
Download,
KeyRound,
Link,
Rss,
Sparkles,
User,
@@ -46,4 +47,9 @@ export const settingsSidebarItems = (
icon: <KeyRound size={18} />,
path: "/settings/api-keys",
},
{
name: t("settings.broken_links.broken_links"),
icon: <Link size={18} />,
path: "/settings/broken-links",
},
];
7 changes: 7 additions & 0 deletions apps/web/lib/i18n/locales/en/translation.json
@@ -38,6 +38,7 @@
"unfavorite": "Unfavorite",
"delete": "Delete",
"refresh": "Refresh",
"recrawl": "Recrawl",
"download_full_page_archive": "Download Full Page Archive",
"edit_tags": "Edit Tags",
"add_to_list": "Add to List",
@@ -103,6 +104,12 @@
"new_api_key_desc": "Give your API key a unique name",
"key_success": "Key was successfully created",
"key_success_please_copy": "Please copy the key and store it somewhere safe. Once you close the dialog, you won't be able to access it again."
},
"broken_links": {
"broken_links": "Broken Links",
"last_crawled_at": "Last Crawled At",
"crawling_status": "Crawling Status",
"crawling_failed": "Crawling Failed"
}
},
"admin": {
10 changes: 6 additions & 4 deletions apps/workers/crawlerWorker.ts
@@ -241,14 +241,12 @@ async function browserlessCrawlPage(jobId: string, url: string) {
const response = await fetch(url, {
signal: AbortSignal.timeout(5000),
});
if (!response.ok) {
throw new Error(`Failed to crawl page: ${response.status}`);
}
logger.info(
`[Crawler][${jobId}] Successfully fetched the content of "${url}". Status: ${response.status}, Size: ${response.size}`,
);
return {
htmlContent: await response.text(),
statusCode: response.status,
screenshot: undefined,
url: response.url,
};
@@ -260,6 +258,7 @@ async function crawlPage(
): Promise<{
htmlContent: string;
screenshot: Buffer | undefined;
statusCode: number;
url: string;
}> {
let browser: Browser | undefined;
@@ -282,7 +281,7 @@
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
);

await page.goto(url, {
const response = await page.goto(url, {
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
});
logger.info(
@@ -328,6 +327,7 @@

return {
htmlContent,
statusCode: response?.status() ?? 0,
screenshot,
url: page.url(),
};
@@ -583,6 +583,7 @@ async function crawlAndParseUrl(
const {
htmlContent,
screenshot,
statusCode,
url: browserUrl,
} = await crawlPage(jobId, url);

@@ -618,6 +619,7 @@
content: readableContent?.textContent,
htmlContent: readableContent?.content,
crawledAt: new Date(),
crawlStatusCode: statusCode,
})
.where(eq(bookmarkLinks.id, bookmarkId));

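With crawlStatusCode persisted on bookmarkLinks, the new bookmarks.getBrokenLinks procedure (defined in one of the changed files not loaded on this page) mostly needs to filter on that column. A rough sketch of such a query, assuming drizzle-orm, treating missing or >= 400 status codes as broken, and eliding per-user filtering; the import paths and selected columns are assumptions, not the repository's actual code:

```ts
// Hypothetical sketch, not the actual router code from this commit.
import { gte, isNull, or } from "drizzle-orm";

import { db } from "@hoarder/db"; // assumed import path
import { bookmarkLinks } from "@hoarder/db/schema"; // assumed import path

// Return links whose last crawl got no HTTP response or a 4xx/5xx status.
export async function findBrokenLinks() {
  return db
    .select({
      id: bookmarkLinks.id,
      url: bookmarkLinks.url,
      crawledAt: bookmarkLinks.crawledAt,
      statusCode: bookmarkLinks.crawlStatusCode,
    })
    .from(bookmarkLinks)
    .where(
      or(
        isNull(bookmarkLinks.crawlStatusCode),
        gte(bookmarkLinks.crawlStatusCode, 400),
      ),
    );
}
```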
1 change: 1 addition & 0 deletions packages/db/drizzle/0034_wet_the_stranger.sql
@@ -0,0 +1 @@
ALTER TABLE `bookmarkLinks` ADD `crawlStatusCode` integer DEFAULT 200;
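
The migration adds the column at the SQL level; the matching drizzle schema change sits in one of the packages/db files not shown here. A plausible shape for it, assuming the project's SQLite schema style, where every column other than crawlStatusCode is a placeholder:

```ts
// Hypothetical excerpt of the drizzle schema, not the actual file from this
// commit; only crawlStatusCode mirrors the migration above.
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";

export const bookmarkLinks = sqliteTable("bookmarkLinks", {
  id: text("id").primaryKey(), // placeholder for the existing columns
  url: text("url").notNull(), // placeholder for the existing columns
  crawledAt: integer("crawledAt", { mode: "timestamp" }), // placeholder
  crawlStatusCode: integer("crawlStatusCode").default(200),
});
```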