ropensci · pachadotdev · Aug 3, 2024 · Aug 3, 2024 · Aug 6, 2024 · Aug 6, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,10 +2,8 @@
 ^\.Rproj\.user$
 ^src/Makevars$
 ^windows
-\.pdf$
-\.png$
+^vignettes/.*\.png$
 \.webp$
-\.jpeg$
 \.o$
 \.dll$
 ^\.travis\.yml$
@@ -14,3 +12,4 @@
 vignettes/.*\.png$
 ^configure.log$
 ^\.github$
+^\.vscode$
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -23,14 +23,15 @@ jobs:
           - {os: windows-latest,    r: 'devel'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
-          - {os: ubuntu-20.04,    r: 'release'}
+          - {os: ubuntu-20.04,    r: 'release', tesseract_version: '4'}
+          - {os: ubuntu-20.04,    r: 'release', tesseract_version: '5'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-pandoc@v2
 
@@ -40,6 +41,13 @@ jobs:
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
+      - name: Install Tesseract 5
+        if: matrix.config.os == 'ubuntu-20.04' && matrix.config.tesseract_version == '5'
+        run: |
+          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+          sudo apt-get update
+          sudo apt-get install -y libtesseract-dev tesseract-ocr tesseract-ocr-eng
+
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
           extra-packages: rcmdcheck

diff --git a/.gitignore b/.gitignore
@@ -2,14 +2,14 @@
 *.so
 *.dll
 *.a
-*.txt
-*.pdf
-*.png
 *.webp
 *.jpeg
+vignettes/*.png
 .Rproj.user
 .Rhistory
 inst/tessdata
 windows
 src/Makevars
 configure.log
+.vscode
+README.html
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,28 +1,38 @@
-Package: tesseract
+Package: cpp11tesseract
 Type: Package
 Title: Open Source OCR Engine
-Version: 5.2.1
-Authors@R: person("Jeroen", "Ooms", role = c("aut", "cre"), email = "[email protected]",
-    comment = c(ORCID = "0000-0002-4035-0289"))
+Version: 5.3.0
+Authors@R: c(person("Jeroen", "Ooms",
+                    role = c("aut", "cre"),
+                    email = "[email protected]",
+                    comment = c(ORCID = "0000-0002-4035-0289")),
+             person("Mauricio", "Vargas Sepulveda",
+                    role = "aut",
+                    email = "[email protected]",
+                    comment = c(ORCID = "0000-0003-1017-7574")),
+             person("Munk School of Global Affairs and Public Policy",
+                    role = "fnd")
+            )
 Description: Bindings to 'Tesseract': 
-     a powerful optical character recognition (OCR) engine that supports over 100 languages.
-     The engine is highly configurable in order to tune the detection algorithms and
-     obtain the best possible results.
+     a powerful optical character recognition (OCR) engine that supports over
+     100 languages. The engine is highly configurable in order to tune the
+     detection algorithms and obtain the best possible results.
 License: Apache License 2.0
 URL: https://docs.ropensci.org/tesseract/ (website) 
     https://github.com/ropensci/tesseract (devel)
 BugReports: https://github.com/ropensci/tesseract/issues
-SystemRequirements: Tesseract >= 3.03 (libtesseract-dev / tesseract-devel) and
-    Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to install
-    the English training data separately (tesseract-ocr-eng)
+SystemRequirements: Tesseract >= 4.0.0 (libtesseract-dev / tesseract-devel) and
+    Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to
+    install the training data for English and any other language separately
+    (e.g. tesseract-ocr-eng or tesseract-ocr-spa).
 Imports:
-    Rcpp (>= 0.12.12),
     pdftools (>= 1.5),    
     curl,
     rappdirs,
     digest
-LinkingTo: Rcpp
-RoxygenNote: 7.3.2
+LinkingTo: 
+    cpp11
+RoxygenNote: 7.3.1
 Roxygen: list(markdown = TRUE)
 Suggests:
     magick (>= 1.7),

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,5 +7,4 @@ export(tesseract)
 export(tesseract_download)
 export(tesseract_info)
 export(tesseract_params)
-importFrom(Rcpp,sourceCpp)
-useDynLib(tesseract)
+useDynLib(cpp11tesseract, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
diff --git a/R/cpp11.R b/R/cpp11.R
@@ -0,0 +1,49 @@
+# Generated by cpp11: do not edit by hand
+
+tesseract_major_version <- function() {
+  .Call(`_cpp11tesseract_tesseract_major_version`)
+}
+
+tesseract_config <- function() {
+  .Call(`_cpp11tesseract_tesseract_config`)
+}
+
+tesseract_engine_internal <- function(datapath, language, confpaths, opt_names, opt_values) {
+  .Call(`_cpp11tesseract_tesseract_engine_internal`, datapath, language, confpaths, opt_names, opt_values)
+}
+
+tesseract_engine_set_variable <- function(ptr, name, value) {
+  .Call(`_cpp11tesseract_tesseract_engine_set_variable`, ptr, name, value)
+}
+
+validate_params <- function(params) {
+  .Call(`_cpp11tesseract_validate_params`, params)
+}
+
+engine_info_internal <- function(ptr) {
+  .Call(`_cpp11tesseract_engine_info_internal`, ptr)
+}
+
+print_params <- function(filename) {
+  .Call(`_cpp11tesseract_print_params`, filename)
+}
+
+get_param_values <- function(api, params) {
+  .Call(`_cpp11tesseract_get_param_values`, api, params)
+}
+
+ocr_raw <- function(input, ptr, HOCR) {
+  .Call(`_cpp11tesseract_ocr_raw`, input, ptr, HOCR)
+}
+
+ocr_file <- function(file, ptr, HOCR) {
+  .Call(`_cpp11tesseract_ocr_file`, file, ptr, HOCR)
+}
+
+ocr_raw_data <- function(input, ptr) {
+  .Call(`_cpp11tesseract_ocr_raw_data`, input, ptr)
+}
+
+ocr_file_data <- function(file, ptr) {
+  .Call(`_cpp11tesseract_ocr_file_data`, file, ptr)
+}
diff --git a/R/cpp11tesseract-package.R b/R/cpp11tesseract-package.R
@@ -0,0 +1,11 @@
+#' @title Open Source OCR Engine
+#'
+#' @description
+#' Bindings to 'Tesseract':
+#' a powerful optical character recognition (OCR) engine that supports over 100
+#' languages. The engine is highly configurable in order to tune the detection
+#' algorithms and obtain the best possible results.
+#'
+#' @name cpp11tesseract-package
+#' @useDynLib cpp11tesseract, .registration = TRUE
+"_PACKAGE"
diff --git a/R/ocr.R b/R/ocr.R
@@ -2,31 +2,30 @@
 #'
 #' Extract text from an image. Requires that you have training data for the language you
 #' are reading. Works best for images with high contrast, little noise and horizontal text.
-#' See [tesseract wiki](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality) and
+#' See the [tesseract documentation](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) and
 #' our package vignette for image preprocessing tips.
 #'
 #' The `ocr()` function returns plain text by default, or hOCR text if hOCR is set to `TRUE`.
 #' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for
 #' each word in the text.
 #'
 #' @export
-#' @useDynLib tesseract
 #' @family tesseract
 #' @param image file path, url, or raw vector to image (png, tiff, jpeg, etc)
 #' @param engine a tesseract engine created with [tesseract()]. Alternatively a
 #' language string which will be passed to [tesseract()].
 #' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
 #' @rdname ocr
 #' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
-#' @importFrom Rcpp sourceCpp
 #' @examples # Simple example
-#' text <- ocr("https://jeroen.github.io/images/testocr.png")
+#' file <- system.file("examples", "testocr.png", package = "cpp11tesseract")
+#' text <- ocr(file)
 #' cat(text)
 #'
-#' xml <- ocr("https://jeroen.github.io/images/testocr.png", HOCR = TRUE)
+#' xml <- ocr(file, HOCR = TRUE)
 #' cat(xml)
 #'
-#' df <- ocr_data("https://jeroen.github.io/images/testocr.png")
+#' df <- ocr_data(file)
 #' print(df)
 #'
 #' \donttest{
@@ -35,7 +34,7 @@
 #' orig <- pdftools::pdf_text("R-intro.pdf")[1]
 #'
 #' # Render pdf to png image
-#' img_file <- pdftools::pdf_convert("R-intro.pdf", format = 'tiff', pages = 1, dpi = 400)
+#' img_file <- pdftools::pdf_convert("R-intro.pdf", format = "tiff", pages = 1, dpi = 400)
 #' unlink("R-intro.pdf")
 #'
 #' # Extract text from png image
@@ -48,7 +47,7 @@
 ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
   if(is.character(engine))
     engine <- tesseract(engine)
-  stopifnot(inherits(engine, "tesseract"))
+  stopifnot(inherits(engine, "externalptr"))
   if(inherits(image, "magick-image")){
     vapply(image, function(x){
       tmp <- tempfile(fileext = ".png")
@@ -71,7 +70,7 @@ ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
 ocr_data <- function(image, engine = tesseract("eng")) {
   if(is.character(engine))
     engine <- tesseract(engine)
-  stopifnot(inherits(engine, "tesseract"))
+  stopifnot(inherits(engine, "externalptr"))
   df_list <- if(inherits(image, "magick-image")){
     lapply(image, function(x){
       tmp <- tempfile(fileext = ".png")

diff --git a/R/onload.R b/R/onload.R
@@ -60,7 +60,7 @@ check_training_data <- function(){
   tryCatch(tesseract(), error = function(e){
     warning("Unable to find English training data", call. = FALSE)
     os <- utils::sessionInfo()$running
-    if(isTRUE(grepl("ubuntu|debian", os, TRUE))){
+    if (isTRUE(grepl("ubuntu|debian|pop", os, TRUE))) {
       stop("DEBIAN / UBUNTU: Please run: apt-get install tesseract-ocr-eng")
     }
   })

diff --git a/R/tessdata.R b/R/tessdata.R
@@ -28,9 +28,10 @@
 #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
 #' @examples \dontrun{
 #' if(is.na(match("fra", tesseract_info()$available)))
-#'   tesseract_download("fra", model = 'best')
+#'   tesseract_download("fra", model = "best")
 #' french <- tesseract("fra")
-#' text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french)
+#' file <- system.file("examples", "french.png", package = "cpp11tesseract")
+#' text <- ocr(file, engine = french)
 #' cat(text)
 #' }
 tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {

diff --git a/R/tesseract.R b/R/tesseract.R
@@ -27,11 +27,11 @@ tesseract <- local({
     language <- as.character(language)
     configs <- as.character(configs)
     options <- as.list(options)
-    if(isTRUE(cache)){
+    if(isTRUE(cache)) {
       key <- digest::digest(list(language, datapath, configs, options))
       if(is.null(store[[key]])){
         ptr <- tesseract_engine(datapath, language, configs, options)
-        assign(key, ptr, store);
+        assign(key, ptr, store)
       }
       store[[key]]
     } else {
@@ -43,7 +43,7 @@ tesseract <- local({
 #' @export
 #' @rdname tesseract
 #' @param filter only list parameters containing a particular string
-#' @examples tesseract_params('debug')
+#' @examples tesseract_params("debug")
 tesseract_params <- function(filter = ""){
   tmp <- print_params(tempfile())
   on.exit(unlink(tmp))