feature: Add OCR support for images. Fixes #296
MohamedBassem committed Oct 20, 2024
1 parent f793646 commit 019b5d2
Showing 8 changed files with 139 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -23,6 +23,7 @@ A self-hostable bookmark-everything app with a touch of AI for the data hoarders
 - 📋 Sort your bookmarks into lists.
 - 🔎 Full text search of all the content stored.
 - ✨ AI-based (aka chatgpt) automatic tagging, with support for local models using ollama!
+- 🎆 OCR for extracting text from images.
 - 🔖 [Chrome plugin](https://chromewebstore.google.com/detail/hoarder/kgcjekpmcjjogibpjebkhaanilehneje) and [Firefox addon](https://addons.mozilla.org/en-US/firefox/addon/hoarder/) for quick bookmarking.
 - 📱 An [iOS app](https://apps.apple.com/us/app/hoarder-app/id6479258022), and an [Android app](https://play.google.com/store/apps/details?id=app.hoarder.hoardermobile&pcampaignid=web_share).
 - 🗄️ Full page archival (using [monolith](https://github.com/Y2Z/monolith)) to protect against link rot.
22 changes: 21 additions & 1 deletion apps/workers/openaiWorker.ts
@@ -23,7 +23,7 @@ import {
 
 import type { InferenceClient } from "./inference";
 import { InferenceClientFactory } from "./inference";
-import { readPDFText } from "./utils";
+import { readImageText, readPDFText } from "./utils";
 
 const openAIResponseSchema = z.object({
   tags: z.array(z.string()),
@@ -152,6 +152,26 @@ async function inferTagsFromImage(
       `[inference][${jobId}] AssetId ${bookmark.asset.assetId} for bookmark ${bookmark.id} not found`,
     );
   }
+
+  let imageText = null;
+  try {
+    imageText = await readImageText(asset);
+  } catch (e) {
+    logger.error(`[inference][${jobId}] Failed to read image text: ${e}`);
+  }
+
+  if (imageText) {
+    logger.info(
+      `[inference][${jobId}] Extracted ${imageText.length} characters from image.`,
+    );
+    await db
+      .update(bookmarkAssets)
+      .set({
+        content: imageText,
+      })
+      .where(eq(bookmarkAssets.id, bookmark.id));
+  }
+
   const base64 = asset.toString("base64");
   return inferenceClient.inferFromImage(
     buildImagePrompt(
1 change: 1 addition & 0 deletions apps/workers/package.json
@@ -34,6 +34,7 @@
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-adblocker": "^2.13.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
+    "tesseract.js": "^5.1.1",
     "tsx": "^4.7.1",
     "typescript": "^5.3.3",
     "zod": "^3.22.4"
22 changes: 22 additions & 0 deletions apps/workers/utils.ts
@@ -1,4 +1,8 @@
+import os from "os";
 import PDFParser from "pdf2json";
+import { createWorker } from "tesseract.js";
+
+import serverConfig from "@hoarder/shared/config";
 
 export function withTimeout<T, Ret>(
   func: (param: T) => Promise<Ret>,
@@ -17,6 +21,24 @@ export function withTimeout<T, Ret>(
   };
 }
 
+export async function readImageText(buffer: Buffer) {
+  if (serverConfig.ocr.langs.length == 1 && serverConfig.ocr.langs[0] == "") {
+    return null;
+  }
+  const worker = await createWorker(serverConfig.ocr.langs, undefined, {
+    cachePath: serverConfig.ocr.cacheDir ?? os.tmpdir(),
+  });
+  try {
+    const ret = await worker.recognize(buffer);
+    if (ret.data.confidence <= serverConfig.ocr.confidenceThreshold) {
+      return null;
+    }
+    return ret.data.text;
+  } finally {
+    await worker.terminate();
+  }
+}
+
 export async function readPDFText(buffer: Buffer): Promise<{
   text: string;
   metadata: Record<string, string>;
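As a quick illustration of the new helper, here is a minimal usage sketch (the file path and runner are hypothetical): `readImageText` resolves to `null` when OCR is disabled via `OCR_LANGS` or when tesseract's confidence falls below the configured threshold.

```ts
import { promises as fs } from "fs";

import { readImageText } from "./utils";

async function main() {
  // Hypothetical image path; any Buffer containing image bytes works.
  const buffer = await fs.readFile("/tmp/screenshot.png");
  const text = await readImageText(buffer);
  console.log(text ?? "OCR disabled or result below confidence threshold");
}

void main();
```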
1 change: 1 addition & 0 deletions docs/docs/01-intro.md
@@ -16,6 +16,7 @@ Hoarder is an open source "Bookmark Everything" app that uses AI for automatical
 - 📋 Sort your bookmarks into lists.
 - 🔎 Full text search of all the content stored.
 - ✨ AI-based (aka chatgpt) automatic tagging, with support for local models using ollama!
+- 🎆 OCR for extracting text from images.
 - 🔖 [Chrome plugin](https://chromewebstore.google.com/detail/hoarder/kgcjekpmcjjogibpjebkhaanilehneje) and [Firefox addon](https://addons.mozilla.org/en-US/firefox/addon/hoarder/) for quick bookmarking.
 - 📱 An [iOS app](https://apps.apple.com/us/app/hoarder-app/id6479258022), and an [Android app](https://play.google.com/store/apps/details?id=app.hoarder.hoardermobile&pcampaignid=web_share).
 - 🗄️ Full page archival (using [monolith](https://github.com/Y2Z/monolith)) to protect against link rot.
12 changes: 11 additions & 1 deletion docs/docs/03-configuration.md
@@ -26,7 +26,7 @@ When setting up OAuth, the allowed redirect URLs configured at the provider shou
 :::
 
 | Name | Required | Default | Description |
-| ------------------------------------------- | -------- | ---------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| ------------------------------------------- | -------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | DISABLE_SIGNUPS | No | false | If enabled, no new signups will be allowed and the signup button will be disabled in the UI |
 | DISABLE_PASSWORD_AUTH | No | false | If enabled, only signups and logins using OAuth are allowed and the signup button and login form for local accounts will be disabled in the UI |
 | OAUTH_WELLKNOWN_URL | No | Not set | The "wellknown Url" for openid-configuration as provided by the OAuth provider |
@@ -74,3 +74,13 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 | CRAWLER_FULL_PAGE_ARCHIVE | No | false | Whether to store a full local copy of the page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, only the readable text of the page is archived. |
 | CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
 | CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+
+## OCR Configs
+
+Hoarder uses [tesseract.js](https://github.com/naptha/tesseract.js) to extract text from images.
+
+| Name | Required | Default | Description |
+| ------------------------ | -------- | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| OCR_CACHE_DIR | No | $TEMP_DIR | The dir where tesseract will download its models. By default, the models are not persisted and are stored in the OS's temp dir. |
+| OCR_LANGS | No | eng | Comma-separated list of the language codes that you want tesseract to support. You can find the language codes [here](https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). Set to an empty string to disable OCR. |
+| OCR_CONFIDENCE_THRESHOLD | No | 50 | A number between 0 and 100 indicating the minimum acceptable confidence from tesseract. If tesseract's confidence is lower than this value, the extracted text won't be stored. |
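As an illustration, a deployment that wants the tesseract models persisted across restarts and recognition for both English and German might set the following (values are illustrative, not defaults):

```
OCR_CACHE_DIR=/data/ocr-cache
OCR_LANGS=eng,deu
OCR_CONFIDENCE_THRESHOLD=60
```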
11 changes: 11 additions & 0 deletions packages/shared/config.ts
@@ -25,6 +25,12 @@ const allEnv = z.object({
   INFERENCE_TEXT_MODEL: z.string().default("gpt-4o-mini"),
   INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"),
   INFERENCE_CONTEXT_LENGTH: z.coerce.number().default(2048),
+  OCR_CACHE_DIR: z.string().optional(),
+  OCR_LANGS: z
+    .string()
+    .default("eng")
+    .transform((val) => val.split(",")),
+  OCR_CONFIDENCE_THRESHOLD: z.coerce.number().default(50),
   CRAWLER_HEADLESS_BROWSER: stringBool("true"),
   BROWSER_WEB_URL: z.string().url().optional(),
   BROWSER_WEBSOCKET_URL: z.string().url().optional(),
@@ -90,6 +96,11 @@ const serverConfigSchema = allEnv.transform((val) => {
       fullPageScreenshot: val.CRAWLER_FULL_PAGE_SCREENSHOT,
       fullPageArchive: val.CRAWLER_FULL_PAGE_ARCHIVE,
     },
+    ocr: {
+      langs: val.OCR_LANGS,
+      cacheDir: val.OCR_CACHE_DIR,
+      confidenceThreshold: val.OCR_CONFIDENCE_THRESHOLD,
+    },
     meilisearch: val.MEILI_ADDR
       ? {
           address: val.MEILI_ADDR,
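One subtlety in the `OCR_LANGS` transform: `"".split(",")` yields `[""]` rather than `[]`, so disabling OCR via an empty `OCR_LANGS` produces a single empty-string entry, which is exactly the shape that `readImageText` checks for. A tiny sketch of the JavaScript behavior being relied on:

```ts
// String.prototype.split never returns an empty array for "",
// so an empty OCR_LANGS becomes a single empty entry after the transform.
const langs = "".split(","); // -> [""]
const ocrDisabled = langs.length === 1 && langs[0] === ""; // true
console.log(ocrDisabled);
```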
71 changes: 71 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default.
