Skip to content

Commit

Permalink
feat: recompile for Node 10.x runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
vladholubiev committed Jun 13, 2019
1 parent 2374943 commit 70d5d01
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ version: 2
jobs:
build:
docker:
- image: circleci/node:8
- image: circleci/node:10

working_directory: ~/repo

Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ node_modules/
*.log
.DS_Store
yarn.lock
*.tar.gz
bin/tesseract-standalone/
Binary file removed bin/tt.tar.br
Binary file not shown.
Binary file added bin/tt.tar.gz
Binary file not shown.
44 changes: 21 additions & 23 deletions compile-tesseract.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
#!/usr/bin/env bash
# Spin up and enter the docker container on your machine with the following command:
# docker run -it lambci/lambda:build-nodejs10.x bash

# install basic stuff required for compilation
sudo yum-config-manager --enable epel

sudo yum install -y aclocal autoconf automake cmakegcc freetype-devel gcc gcc-c++ \
# Then run the rest of the commands inside

# install basic stuff required for compilation
yum install -y aclocal autoconf automake cmakegcc freetype-devel gcc gcc-c++ \
git lcms2-devel libjpeg-devel libjpeg-turbo-devel autogen autoconf libtool \
libpng-devel libtiff-devel libtool libwebp-devel libzip-devel make zlib-devel
sudo yum groupinstall "Development Tools" -y

# autoconf
cd ~
wget http://babyname.tips/mirrors/gnu/autoconf-archive/autoconf-archive-2017.09.28.tar.xz
tar -xvf autoconf-archive-2017.09.28.tar.xz
cd autoconf-archive-2017.09.28
./configure && make && sudo make install
sudo cp m4/* /usr/share/aclocal/cd ~ wget http://babynam

# leptonica
cd ~
Expand All @@ -23,29 +16,31 @@ cd leptonica/
./autogen.sh
./configure
make
sudo make install
make install

# tesseract
cd ~
git clone https://github.com/tesseract-ocr/tesseract.git
cd tesseract
git checkout 4.0.0
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
./autogen.sh
./configure
make
sudo make install
make install

cd ~
mkdir tesseract-standalone

# trim unneeded ~ 15 MB
strip ./tesseract-standalone/**/*

# copy files
cd tesseract-standalone
cp /usr/local/bin/tesseract .
mkdir lib
cp /usr/local/lib/libtesseract.so.4 lib/
cp /lib64/libpng15.so.15 lib/
cp /lib64/libtiff.so.5 lib/
cp /lib64/libgomp.so.1 lib/
cp /lib64/libjbig.so.2.0 lib/
cp /usr/local/lib/liblept.so.5 lib/
cp /usr/lib64/libjpeg.so.62 lib/
cp /usr/lib64/libwebp.so.4 lib/
Expand All @@ -54,13 +49,16 @@ cp /usr/lib64/libstdc++.so.6 lib/
# copy training data
mkdir tessdata
cd tessdata
wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata
curl -L https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata --output eng.traineddata

# archive
cd ~
tar -zcvf tesseract.tar.gz tesseract-standalone

# download from EC2 to local machine
scp [email protected]:/home/ec2-user/tesseract.tar.gz $(pwd)
# trim unneeded ~ 15 MB
strip ./tesseract-standalone/**/*

tar -zcvf tesseract.tar.gz tesseract-standalone

# run compress-with-brotli.sh on local machine now
# download the archive from the docker container to the local machine
# 21c27dc1bf5d is the docker container id; you can look it up by running "docker ps"
docker cp 21c27dc1bf5d:/root/tesseract.tar.gz tt.tar.gz
4 changes: 0 additions & 4 deletions compress-with-brotli.sh

This file was deleted.

11 changes: 5 additions & 6 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@shelf/aws-lambda-tesseract",
"version": "1.3.2",
"description": "11 MB Tesseract (with English training data) to fit inside AWS Lambda compressed with Brotli",
"description": "6 MB Tesseract (with English training data) to fit inside AWS Lambda",
"license": "MIT",
"repository": "shelfio/aws-lambda-tesseract",
"author": {
Expand All @@ -10,7 +10,7 @@
"url": "shelf.io"
},
"engines": {
"node": ">=8.10"
"node": ">=10"
},
"scripts": {
"lint": "eslint . --fix",
Expand All @@ -24,12 +24,11 @@
"keywords": [
"lambda",
"ocr",
"tesseract",
"brotli"
"tesseract"
],
"dependencies": {
"@shelf/aws-lambda-brotli-unpacker": "0.0.2",
"is-image": "3.0.0"
"is-image": "3.0.0",
"tar": "4.4.10"
},
"devDependencies": {
"@shelf/eslint-config": "0.3.5",
Expand Down
13 changes: 8 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# aws-lambda-tesseract [![CircleCI](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![Tesseract](https://img.shields.io/badge/tesseract-11_MB-brightgreen.svg)](bin/)
# aws-lambda-tesseract [![CircleCI](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![Tesseract](https://img.shields.io/badge/tesseract-6_MB-brightgreen.svg)](bin/)

> 11 MB Tesseract (with English training data) to fit inside AWS Lambda compressed with Brotli
> 6 MB Tesseract (with English training data) to fit inside AWS Lambda
Inspired by [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda) & [lambda-scanner-ocr](https://github.com/philippkeller/lambda-scanner-ocr)

Expand All @@ -10,9 +10,13 @@ Inspired by [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda) &
$ yarn add @shelf/aws-lambda-tesseract
```

`1.x` versions of this library were compiled for the Node 8.10 runtime.

`2.x` versions are compiled for the Node 10.x runtime.

## How does it work?

This package contains an archive with [Tesseract 4.0 beta](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment.
This package contains an archive with [Tesseract 4.0](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment.

When a Lambda starts, it unpacks an archive with a binary to the `/tmp` folder and makes sure it's done only once per Lambda cold start.

Expand All @@ -38,14 +42,13 @@ unsupported by Tesseract file extensions.

## Compile It Yourself

See [compile-tesseract.sh](compile-tesseract.sh) & [compress-with-brotli.sh](compress-with-brotli.sh) files
See [compile-tesseract.sh](compile-tesseract.sh)

Run the `test.sh` script to smoke-test that it works.

## See Also

- [aws-lambda-libreoffice](https://github.com/shelfio/aws-lambda-libreoffice)
- [aws-lambda-brotli-unpacker](https://github.com/shelfio/aws-lambda-brotli-unpacker)
- [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda)

## License
Expand Down
16 changes: 9 additions & 7 deletions src/index.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
const {unpack} = require('@shelf/aws-lambda-brotli-unpacker');
const {extract} = require('tar');
const {execFileSync, execSync} = require('child_process');
const path = require('path');
const isImage = require('is-image');

const unsupportedExtensions = new Set(['ai', 'emf', 'eps', 'gif', 'ico', 'psd', 'svg']);
const inputPath = path.join(__dirname, '..', 'bin', 'tt.tar.br');
const outputPath = '/tmp/tesseract/tesseract';
const inputPath = path.join(__dirname, '..', 'bin', 'tt.tar.gz');
const outputPath = '/tmp/tesseract-standalone/tesseract';

module.exports.getExecutablePath = async function() {
return unpack({inputPath, outputPath});
await extract({file: inputPath, cwd: '/tmp'});

return outputPath;
};

module.exports.getTextFromImage = async function(filePath) {
const ttBinary = await unpack({inputPath, outputPath});
await extract({file: inputPath, cwd: '/tmp'});

const stdout = execFileSync(ttBinary, [filePath, 'stdout', '-l', 'eng'], {
cwd: '/tmp/tesseract',
const stdout = execFileSync(outputPath, [filePath, 'stdout', '-l', 'eng'], {
cwd: '/tmp/tesseract-standalone',
env: {
LD_LIBRARY_PATH: './lib',
TESSDATA_PREFIX: './tessdata'
Expand Down
6 changes: 5 additions & 1 deletion test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/usr/bin/env bash

cd bin
tar -xvzf tt.tar.gz
cd ..

docker run --rm \
-v "$PWD":/var/task \
lambci/lambda:nodejs8.10 test.handler
lambci/lambda:nodejs10.x test.handler

0 comments on commit 70d5d01

Please sign in to comment.