Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: Add PDF support #88

Merged
merged 10 commits into from
Apr 11, 2024
6 changes: 4 additions & 2 deletions apps/mobile/lib/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export function useUploadAsset(
mutationFn: async (file: { type: string; name: string; uri: string }) => {
const formData = new FormData();
// @ts-expect-error This is a valid api in react native
formData.append("image", {
formData.append("file", {
uri: file.uri,
name: file.name,
type: file.type,
Expand All @@ -57,7 +57,9 @@ export function useUploadAsset(
},
onSuccess: (resp) => {
const assetId = resp.assetId;
createBookmark({ type: "asset", assetId, assetType: "image" });
const assetType =
resp.contentType === "application/pdf" ? "pdf" : "image";
createBookmark({ type: "asset", assetId, assetType });
},
onError: (e) => {
if (options.onError) {
Expand Down
2 changes: 1 addition & 1 deletion apps/mobile/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"expo-navigation-bar": "~2.8.1",
"expo-router": "~3.4.8",
"expo-secure-store": "^12.8.1",
"expo-share-intent": "^1.1.0",
"expo-share-intent": "1.1.0",
"expo-status-bar": "~1.11.1",
"expo-system-ui": "^2.9.3",
"expo-web-browser": "^12.8.2",
Expand Down
7 changes: 5 additions & 2 deletions apps/web/app/api/assets/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ const SUPPORTED_ASSET_TYPES = new Set([
"image/jpeg",
"image/png",
"image/webp",
"application/pdf",
]);

const MAX_UPLOAD_SIZE_BYTES = serverConfig.maxAssetSizeMb * 1024 * 1024;
Expand All @@ -26,7 +27,7 @@ export async function POST(request: Request) {
});
}
const formData = await request.formData();
const data = formData.get("image");
const data = formData.get("file") ?? formData.get("image");
let buffer;
let contentType;
if (data instanceof File) {
Expand All @@ -46,17 +47,19 @@ export async function POST(request: Request) {
}

const assetId = crypto.randomUUID();
const fileName = data.name;

await saveAsset({
userId: ctx.user.id,
assetId,
metadata: { contentType },
metadata: { contentType, fileName },
asset: buffer,
});

return Response.json({
assetId,
contentType,
size: buffer.byteLength,
fileName,
} satisfies ZUploadResponse);
}
7 changes: 4 additions & 3 deletions apps/web/components/dashboard/UploadDropzone.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ function useUploadAsset({ onComplete }: { onComplete: () => void }) {
const { mutateAsync: runUpload } = useMutation({
mutationFn: async (file: File) => {
const formData = new FormData();
formData.append("image", file);
formData.append("file", file);
const resp = await fetch("/api/assets", {
method: "POST",
body: formData,
Expand All @@ -40,8 +40,9 @@ function useUploadAsset({ onComplete }: { onComplete: () => void }) {
return zUploadResponseSchema.parse(await resp.json());
},
onSuccess: async (resp) => {
const assetId = resp.assetId;
return createBookmark({ type: "asset", assetId, assetType: "image" });
const assetType =
resp.contentType === "application/pdf" ? "pdf" : "image";
return createBookmark({ ...resp, type: "asset", assetType });
},
onError: (error, req) => {
const err = zUploadErrorSchema.parse(JSON.parse(error.message));
Expand Down
7 changes: 7 additions & 0 deletions apps/web/components/dashboard/bookmarks/AssetCard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ export default function AssetCard({
/>
</div>
)}
{bookmarkedAsset.assetType == "pdf" && (
<iframe
title={bookmarkedAsset.assetId}
className="h-56 max-h-56 w-full"
src={`/api/assets/${bookmarkedAsset.assetId}`}
/>
)}
<div className="flex flex-col gap-y-1 overflow-hidden p-2">
<div className="flex h-full flex-wrap gap-1 overflow-hidden">
<TagList
Expand Down
39 changes: 22 additions & 17 deletions apps/web/components/dashboard/preview/AssetContentSection.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,30 @@ export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) {
throw new Error("Invalid content type");
}

let content;
switch (bookmark.content.assetType) {
case "image": {
switch (bookmark.content.assetType) {
case "image": {
content = (
<div className="relative h-full min-w-full">
<Image
alt="asset"
fill={true}
className="object-contain"
src={`/api/assets/${bookmark.content.assetId}`}
/>
</div>
);
}
}
break;
return (
<div className="relative h-full min-w-full">
<Image
alt="asset"
fill={true}
className="object-contain"
src={`/api/assets/${bookmark.content.assetId}`}
/>
</div>
);
}
case "pdf": {
return (
<iframe
title={bookmark.content.assetId}
className="h-full w-full"
src={`/api/assets/${bookmark.content.assetId}`}
/>
);
}
default: {
return <div>Unsupported asset type</div>;
}
}
return content;
}
69 changes: 57 additions & 12 deletions apps/workers/openaiWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ import { z } from "zod";

import type { ZOpenAIRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
import { bookmarks, bookmarkTags, tagsOnBookmarks } from "@hoarder/db/schema";
import {
bookmarkAssets,
bookmarks,
bookmarkTags,
tagsOnBookmarks,
} from "@hoarder/db/schema";
import { readAsset } from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
Expand All @@ -18,6 +23,7 @@ import {

import type { InferenceClient } from "./inference";
import { InferenceClientFactory } from "./inference";
import { readPDFText, truncateContent } from "./utils";

const openAIResponseSchema = z.object({
tags: z.array(z.string()),
Expand Down Expand Up @@ -91,14 +97,6 @@ CONTENT START HERE:
function buildPrompt(
bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
) {
const truncateContent = (content: string) => {
let words = content.split(" ");
if (words.length > 1500) {
words = words.slice(1500);
content = words.join(" ");
}
return content;
};
if (bookmark.link) {
if (!bookmark.link.description && !bookmark.link.content) {
throw new Error(
Expand Down Expand Up @@ -158,14 +156,48 @@ async function inferTagsFromImage(
);
}
const base64 = asset.toString("base64");

return await inferenceClient.inferFromImage(
return inferenceClient.inferFromImage(
IMAGE_PROMPT_BASE,
metadata.contentType,
base64,
);
}

/**
 * Infers tags for a PDF asset bookmark.
 *
 * Loads the asset bytes with `readAsset`, extracts the text with
 * `readPDFText`, stores the extracted text and (JSON-serialized) metadata on
 * the `bookmarkAssets` row whose id equals the bookmark id, and finally asks
 * the inference client to infer from a text prompt built on the truncated
 * PDF text.
 *
 * @param jobId - Job identifier, used only to prefix error messages.
 * @param bookmark - The fetched bookmark; its `asset` relation is read.
 * @param inferenceClient - Client used to run the text inference.
 * @returns Whatever `inferenceClient.inferFromText` returns for the prompt.
 * @throws Error when the asset cannot be read or the PDF yields no text.
 */
async function inferTagsFromPDF(
  jobId: string,
  bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
  inferenceClient: InferenceClient,
) {
  const assetId = bookmark.asset.assetId;
  const { asset } = await readAsset({ userId: bookmark.userId, assetId });
  if (!asset) {
    throw new Error(
      `[inference][${jobId}] AssetId ${assetId} for bookmark ${bookmark.id} not found`,
    );
  }

  const parsed = await readPDFText(asset);
  if (!parsed?.text) {
    throw new Error(
      `[inference][${jobId}] PDF text is empty. Please make sure that the PDF includes text and not just images.`,
    );
  }
  const { text, metadata } = parsed;

  // Persist the extracted text/metadata before running inference.
  const serializedMetadata = metadata ? JSON.stringify(metadata) : null;
  await db
    .update(bookmarkAssets)
    .set({ content: text, metadata: serializedMetadata })
    .where(eq(bookmarkAssets.id, bookmark.id));

  const prompt = `${TEXT_PROMPT_BASE}
Content: ${truncateContent(text)}
`;
  return inferenceClient.inferFromText(prompt);
}

async function inferTagsFromText(
bookmark: NonNullable<Awaited<ReturnType<typeof fetchBookmark>>>,
inferenceClient: InferenceClient,
Expand All @@ -182,11 +214,24 @@ async function inferTags(
if (bookmark.link || bookmark.text) {
response = await inferTagsFromText(bookmark, inferenceClient);
} else if (bookmark.asset) {
response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
switch (bookmark.asset.assetType) {
case "image":
response = await inferTagsFromImage(jobId, bookmark, inferenceClient);
break;
case "pdf":
response = await inferTagsFromPDF(jobId, bookmark, inferenceClient);
break;
default:
throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
}
} else {
throw new Error(`[inference][${jobId}] Unsupported bookmark type`);
}

if (!response) {
throw new Error(`[inference][${jobId}] Inference response is empty`);
}

try {
let tags = openAIResponseSchema.parse(JSON.parse(response.response)).tags;
logger.info(
Expand Down
2 changes: 2 additions & 0 deletions apps/workers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
"metascraper-url": "^5.43.4",
"ollama": "^0.5.0",
"openai": "^4.29.0",
"pdf2json": "^3.0.5",
"pdfjs-dist": "^4.0.379",
"puppeteer": "^22.0.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
Expand Down
7 changes: 7 additions & 0 deletions apps/workers/searchWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ async function runIndex(
with: {
link: true,
text: true,
asset: true,
tagsOnBookmarks: {
with: {
tag: true,
Expand All @@ -72,6 +73,12 @@ async function runIndex(
content: bookmark.link.content,
}
: undefined),
...(bookmark.asset
? {
content: bookmark.asset.content,
metadata: bookmark.asset.metadata,
}
: undefined),
...(bookmark.text ? { content: bookmark.text.text } : undefined),
note: bookmark.note,
createdAt: bookmark.createdAt.toISOString(),
Expand Down
32 changes: 32 additions & 0 deletions apps/workers/utils.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import PDFParser from "pdf2json";

export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
timeoutSec: number,
Expand All @@ -14,3 +16,33 @@ export function withTimeout<T, Ret>(
]);
};
}

/**
 * Extracts the raw text and metadata of a PDF held in memory.
 *
 * Wraps the event-based pdf2json parser in a promise: the promise resolves
 * once the parser emits `pdfParser_dataReady` and rejects with whatever the
 * parser passes to `pdfParser_dataError`.
 *
 * @param buffer - The PDF file contents.
 * @returns The parser's raw text content and the `Meta` object of the parsed data.
 */
export async function readPDFText(buffer: Buffer): Promise<{
  text: string;
  metadata: Record<string, string>;
}> {
  return new Promise((resolve, reject) => {
    // The second constructor argument (1) enables raw-text extraction.
    // Reference: https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
    const parser = new PDFParser(null, 1);

    parser.on("pdfParser_dataError", reject);
    parser.on("pdfParser_dataReady", (parsed) => {
      // getRawTextContent is missing from the library's typings, hence the cast.
      // Reference: https://github.com/modesty/pdf2json/issues/327
      // eslint-disable-next-line
      const rawText = (parser as any).getRawTextContent();
      // eslint-disable-next-line
      resolve({ text: rawText, metadata: parsed.Meta });
    });

    parser.parseBuffer(buffer);
  });
}

/**
 * Limits `content` to at most `length` space-separated words.
 *
 * Words are obtained by splitting on a single space character. When the
 * content has more words than the limit, only the FIRST `length` words are
 * kept (joined back with single spaces); otherwise the content is returned
 * unchanged.
 *
 * @param content - The text to truncate.
 * @param length - Maximum number of words to keep. Defaults to 1500. Negative
 *   values are treated as 0.
 * @returns The possibly shortened text.
 */
export function truncateContent(content: string, length = 1500) {
  // Clamp so a negative limit cannot turn into "drop N words from the end".
  const limit = Math.max(0, length);
  const words = content.split(" ");
  if (words.length > limit) {
    // Keep the leading words; `slice(limit)` would keep the tail instead.
    return words.slice(0, limit).join(" ");
  }
  return content;
}
3 changes: 3 additions & 0 deletions packages/db/drizzle/0015_first_reavers.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ALTER TABLE bookmarkAssets ADD `content` text;--> statement-breakpoint
ALTER TABLE bookmarkAssets ADD `metadata` text;--> statement-breakpoint
ALTER TABLE bookmarkAssets ADD `info` text;
2 changes: 2 additions & 0 deletions packages/db/drizzle/0016_shallow_rawhide_kid.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE bookmarkAssets ADD `fileName` text;--> statement-breakpoint
ALTER TABLE `bookmarkAssets` DROP COLUMN `info`;
Loading
Loading