tesseract reorganized documentation links

ropensci · Jul 27, 2024 · f81583e · f81583e
1 parent 6535e9e
commit f81583e
Show file tree

Hide file tree

Showing 11 changed files with 56 additions and 30 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,8 +2,6 @@
 ^\.Rproj\.user$
 ^src/Makevars$
 ^windows
-\.pdf$
-\.png$
 \.webp$
 \.jpeg$
 \.o$
@@ -14,3 +12,4 @@
 vignettes/.*\.png$
 ^configure.log$
 ^\.github$
+^\.vscode$
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,4 @@ inst/tessdata
 windows
 src/Makevars
 configure.log
-.vscode/settings.json
-.vscode/launch.json
-.vscode/c_cpp_properties.json
+.vscode
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,22 +1,28 @@
 Package: tesseract
 Type: Package
 Title: Open Source OCR Engine
-Version: 5.2.1
-Authors@R: person("Jeroen", "Ooms", role = c("aut", "cre"), email = "[email protected]",
-    comment = c(ORCID = "0000-0002-4035-0289"))
+Version: 5.3.0
+Authors@R: c(person("Jeroen", "Ooms",
+                    role = c("aut", "cre"),
+                    email = "[email protected]",
+                    comment = c(ORCID = "0000-0002-4035-0289")),
+             person("Mauricio", "Vargas Sepulveda",
+                    role = "aut", 
+                    email = "[email protected]", 
+                    comment = c(ORCID = "0000-0003-1017-7574")))
 Description: Bindings to 'Tesseract': 
-     a powerful optical character recognition (OCR) engine that supports over 100 languages.
-     The engine is highly configurable in order to tune the detection algorithms and
-     obtain the best possible results.
+     a powerful optical character recognition (OCR) engine that supports over
+     100 languages. The engine is highly configurable in order to tune the
+     detection algorithms and obtain the best possible results.
 License: Apache License 2.0
 URL: https://docs.ropensci.org/tesseract/ (website) 
     https://github.com/ropensci/tesseract (devel)
 BugReports: https://github.com/ropensci/tesseract/issues
 SystemRequirements: Tesseract >= 3.03 (libtesseract-dev / tesseract-devel) and
-    Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to install
-    the English training data separately (tesseract-ocr-eng)
+    Leptonica (libleptonica-dev / leptonica-devel). On Debian you need to
+    install the training data for English and other languages separately
+    (e.g. tesseract-ocr-eng or tesseract-ocr-spa).
 Imports:
-    Rcpp (>= 0.12.12),
     pdftools (>= 1.5),    
     curl,
     rappdirs,

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,5 @@ export(tesseract)
 export(tesseract_download)
 export(tesseract_info)
 export(tesseract_params)
-importFrom(Rcpp,sourceCpp)
 useDynLib(tesseract)
 useDynLib(tesseract, .registration = TRUE)
diff --git a/NEWS b/NEWS
@@ -1,3 +1,7 @@
+5.3.0
+  - The C++ code was refactored to use cpp11 instead of Rcpp, which also
+    makes vendoring possible.
+
 5.2.1
   - Fix shell script for cross compilation
 

diff --git a/R/ocr.R b/R/ocr.R
@@ -18,7 +18,6 @@
 #' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
 #' @rdname ocr
 #' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
-#' @importFrom Rcpp sourceCpp
 #' @examples # Simple example
 #' text <- ocr("https://jeroen.github.io/images/testocr.png")
 #' cat(text)

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -15,12 +15,15 @@ Magick
 Nederlands
 ocr
 opensource
+ORCID
 pdftools
 png
 rmarkdown
 spanish
+Sepulveda
 tessdata
 toc
+utrecht
 VignetteEncoding
 VignetteEngine
 VignetteIndexEntry
diff --git a/inst/examples/bowers.jpg b/inst/examples/bowers.jpg
diff --git a/man/tesseract-package.Rd b/man/tesseract-package.Rd
diff --git a/src/tesseract_types.h b/src/tesseract_types.h
@@ -14,6 +14,17 @@ inline void tess_finalizer(tesseract::TessBaseAPI* engine) {
 
 typedef cpp11::external_pointer<tesseract::TessBaseAPI> TessPtr;
 
-inline TessPtr make_tess_ptr(tesseract::TessBaseAPI* engine) {
+inline void set_tesseract_options(tesseract::TessBaseAPI* engine,
+                                  cpp11::list options) {
+  for (int i = 0; i < options.size(); ++i) {
+    std::string key = cpp11::as_cpp<std::string>(options.names()[i]);
+    std::string value = cpp11::as_cpp<std::string>(options[i]);
+    engine->SetVariable(key.c_str(), value.c_str());
+  }
+}
+
+inline TessPtr make_tess_ptr(tesseract::TessBaseAPI* engine,
+                             cpp11::list options = cpp11::list()) {
+  set_tesseract_options(engine, options);
   return TessPtr(engine, tess_finalizer);
-}
+}
diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd
@@ -32,7 +32,7 @@ Keep in mind that OCR (pattern recognition in general) is a very difficult probl
 
 OCR is the process of finding and recognizing text inside images, for example from a screenshot or a scanned paper document. The image below has some example text:
 
-![test](https://jeroen.github.io/images/testocr.png){data-external=1}
+![test](../inst/examples/testocr.png){data-external=1}
 
 ```{r}
 library(tesseract)
@@ -60,7 +60,7 @@ tesseract_info()
 
 By default the R package only includes English training data. Windows and Mac users can install additional training data using `tesseract_download()`. Let's OCR a screenshot from Wikipedia in Dutch (Nederlands):
 
-[![utrecht](https://jeroen.github.io/images/utrecht2.png)](https://nl.wikipedia.org/wiki/Geschiedenis_van_de_stad_Utrecht)
+[![utrecht](../inst/examples/utrecht2.png)](https://nl.wikipedia.org/wiki/Geschiedenis_van_de_stad_Utrecht)
 
 ```{r, eval=FALSE}
 # Only need to do download once:
@@ -70,7 +70,8 @@ tesseract_download("nld")
 ```{r eval = has_nld}
 # Now load the dictionary
 (dutch <- tesseract("nld"))
-text <- ocr("https://jeroen.github.io/images/utrecht2.png", engine = dutch)
+file <- system.file("examples", "utrecht2.png", package = "tesseract")
+text <- ocr(file, engine = dutch)
 cat(text)
 ```
 
@@ -79,7 +80,7 @@ As you can see immediately: almost perfect! (OK just take my word).
 
 ## Preprocessing with Magick
 
-The accuracy of the OCR process depends on the quality of the input image. You can often improve results by properly scaling the image, removing noise and artifacts or cropping the area where the text exists. See [tesseract wiki: improve quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality) for important tips to improve the quality of your input image.
+The accuracy of the OCR process depends on the quality of the input image. You can often improve results by properly scaling the image, removing noise and artifacts or cropping the area where the text exists. See [tesseract wiki: improve quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality) for important tips to improve the quality of your input image.
 
 The awesome [magick](https://cran.r-project.org/package=magick/vignettes/intro.html) R package has many useful functions that can be used for enhancing the quality of the image. Some things to try:
 
@@ -94,12 +95,13 @@ The awesome [magick](https://cran.r-project.org/package=magick/vignettes/intro.h
 
 Below is an example OCR scan. The code converts it to black-and-white and resizes + crops the image before feeding it to tesseract to get more accurate OCR results.
 
-![bowers](https://jeroen.github.io/images/bowers.jpg){data-external=1}
+![bowers](../inst/examples/bowers.jpg){data-external=1}
 
 
 ```{r}
 library(magick)
-input <- image_read("https://jeroen.github.io/images/bowers.jpg")
+file <- system.file("examples", "bowers.jpg", package = "tesseract")
+input <- image_read(file)
 
 text <- input %>%
   image_resize("2000x") %>%
@@ -117,7 +119,8 @@ cat(text)
 If your images are stored in PDF files they first need to be converted to a proper image format. We can do this in R using the `pdf_convert` function from the pdftools package. Use a high DPI to preserve the quality of the image.
 
 ```{r, eval=require(pdftools)}
-pngfile <- pdftools::pdf_convert('https://jeroen.github.io/images/ocrscan.pdf', dpi = 600)
+file <- system.file("examples", "ocrscan.pdf", package = "tesseract")
+pngfile <- pdftools::pdf_convert(file, dpi = 600)
 text <- tesseract::ocr(pngfile)
 cat(text)
 ```
@@ -144,19 +147,18 @@ One powerful parameter is `tessedit_char_whitelist` which restricts the output t
 
 The whitelist parameter works for all versions of Tesseract engine 3 and also engine versions 4.1 and higher, but unfortunately it did not work in Tesseract 4.0.
 
-![receipt](https://jeroen.github.io/images/receipt.png){data-external=1}
+![receipt](../inst/examples/receipt.png){data-external=1}
 
 ```{r}
-Sys.setenv(TESSDATA_PREFIX = tempdir())
 numbers <- tesseract(options = list(tessedit_char_whitelist = "$.0123456789"))
-cat(ocr("https://jeroen.github.io/images/receipt.png", engine = numbers))
+file <- system.file("examples", "receipt.png", package = "tesseract")
+cat(ocr(file, engine = numbers))
 ```
 
 To test if this actually works, look at what happens if we remove the `$` from `tessedit_char_whitelist`:
 
 ```{r}
 # Do not allow any dollar sign 
 numbers2 <- tesseract(options = list(tessedit_char_whitelist = ".0123456789"))
-cat(ocr("https://jeroen.github.io/images/receipt.png", engine = numbers2))
+cat(ocr(file, engine = numbers2))
 ```
-