Skip to content

Commit

Permalink
Update Tesseract (to 4.0!), PDFBox
Browse files Browse the repository at this point in the history
  • Loading branch information
adamhooper committed Jan 25, 2019
1 parent 42fee7d commit 0268f60
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 12 deletions.
12 changes: 6 additions & 6 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

name := "pdfocr"

version := "0.0.10"
version := "0.0.11"

homepage := Some(url("https://github.com/overview/pdfocr"))

Expand All @@ -21,18 +21,18 @@ fork in (Compile, run) := true // Main calls System.exit() to give proper return

// Compile settings

scalaVersion := "2.12.6"
scalaVersion := "2.12.8"

scalacOptions += "-deprecation"

libraryDependencies ++= Seq(
"org.bouncycastle" % "bcmail-jdk15on" % "1.59", // https://pdfbox.apache.org/2.0/dependencies.html
"org.bouncycastle" % "bcprov-jdk15on" % "1.59", // https://pdfbox.apache.org/2.0/dependencies.html
"org.bouncycastle" % "bcpkix-jdk15on" % "1.59", // https://pdfbox.apache.org/2.0/dependencies.html
"org.bouncycastle" % "bcmail-jdk15on" % "1.60", // https://pdfbox.apache.org/2.0/dependencies.html
"org.bouncycastle" % "bcprov-jdk15on" % "1.60", // https://pdfbox.apache.org/2.0/dependencies.html
"org.bouncycastle" % "bcpkix-jdk15on" % "1.60", // https://pdfbox.apache.org/2.0/dependencies.html
"com.github.jai-imageio" % "jai-imageio-core" % "1.4.0", // for TIFF support
"com.github.jai-imageio" % "jai-imageio-jpeg2000" % "1.3.0", // for JPEG2000 support
"org.apache.pdfbox" % "jbig2-imageio" % "3.0.0",
"org.apache.pdfbox" % "pdfbox" % "2.0.9",
"org.apache.pdfbox" % "pdfbox" % "2.0.13",
"org.scalatest" %% "scalatest" % "3.0.5" % "test",
"org.mockito" % "mockito-core" % "2.18.3" % "test",
"org.slf4j" % "jcl-over-slf4j" % "1.7.25" % "test", // So we can mute warnings during testing
Expand Down
5 changes: 3 additions & 2 deletions src/main/scala/org/overviewproject/pdfocr/ocr/Tesseract.scala
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,10 @@ class Tesseract(val options: TesseractOptions) {
try {
val processBuilder = new ProcessBuilder(
options.tesseractPath,
"-", "-",
"-l", languages.map(_.getISO3Language).mkString("+"), // Languages
"-psm", "1", // Page segmentation + orientation/script detection
"--psm", "1", // Page segmentation + orientation/script detection
"--oem", "1", // LSTM engine (Tesseract 4.0)
"-", "-", // stdin, stdout
"hocr"
)
processBuilder.start
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions src/test/resources/fake-tesseract
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#
# Spits out its invocation to stderr, then spits out its input to stdout.
#
# Special language values ($4) cause special results:
# Special language values ($2) cause special results:
#
# * "zxx": retval 1, missing-language warning to stderr
# * "osd": retval 0, missing-language warning to stderr
# * "und": retval 1, other error message


case "$4" in
case "$2" in
osd)
# Put dummy data above and below the language message we grep for
echo "blah" >&2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ class TesseractSpec extends UnitSpec {

it("shells to Tesseract and collects stderr") {
val result = tesseract.ocr(image, Seq(new Locale("en"))).futureValue
new String(result.standardError, "utf-8") must equal("- - -l eng -psm 1 hocr\n")
new String(result.standardError, "utf-8") must equal("-l eng --psm 1 --oem 1 - - hocr\n")
}

it("concatenates languages using +") {
val result = tesseract.ocr(image, Seq(new Locale("en"), new Locale("fr"))).futureValue
new String(result.standardError, "utf-8") must equal("- - -l eng+fra -psm 1 hocr\n")
new String(result.standardError, "utf-8") must equal("-l eng+fra --psm 1 --oem 1 - - hocr\n")
}

it("sends Tesseract the image as BMP") {
Expand Down

0 comments on commit 0268f60

Please sign in to comment.