From 64aa95ef6fec511194ef38f755c93ba751042bcb Mon Sep 17 00:00:00 2001
From: Mauricio 'Pacha' Vargas Sepulveda <m.sepulveda@mail.utoronto.ca>
Date: Fri, 2 Aug 2024 21:10:31 -0400
Subject: [PATCH] small PR to download best models with Tesseract 4 or higher

---
 DESCRIPTION     |  2 +-
 R/tessdata.R    | 48 +++++++++++++++++++++++++++++++++---------------
 man/tessdata.Rd | 15 +++++++++++----
 3 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 838561b..d96ab6b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -22,7 +22,7 @@ Imports:
     rappdirs,
     digest
 LinkingTo: Rcpp
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Roxygen: list(markdown = TRUE)
 Suggests:
     magick (>= 1.7),
diff --git a/R/tessdata.R b/R/tessdata.R
index 9ffeedf..5ca10af 100644
--- a/R/tessdata.R
+++ b/R/tessdata.R
@@ -1,8 +1,7 @@
 #' Tesseract Training Data
 #'
 #' Helper function to download training data from the official
-#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. Only use this function on
-#' Windows and OS-X. On Linux, training data can be installed directly with
+#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with
 #' [yum](https://src.fedoraproject.org/rpms/tesseract) or
 #' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-).
 #'
@@ -23,59 +22,78 @@
 #' @family tesseract
 #' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository.
 #' @param datapath destination directory where to download store the file
+#' @param best logical; if `TRUE`, download the most accurate (but slower) `tessdata_best` models instead of `tessdata_fast`. Ignored for Tesseract versions below 4.0.
 #' @param progress print progress while downloading
 #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files)
 #' @examples \dontrun{
-#' if(is.na(match("fra", tesseract_info()$available)))
+#' if (is.na(match("fra", tesseract_info()$available))) {
 #'   tesseract_download("fra")
+#' }
 #' french <- tesseract("fra")
 #' text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french)
 #' cat(text)
 #' }
-tesseract_download <- function(lang, datapath = NULL, progress = interactive()){
+tesseract_download <- function(lang, datapath = NULL, best = FALSE, progress = interactive()) {
   stopifnot(is.character(lang))
-  if(!length(datapath)){
+
+  if (!length(datapath)) {
     warn_on_linux()
     datapath <- tesseract_info()$datapath
   }
   datapath <- normalizePath(datapath, mustWork = TRUE)
   version <- tesseract_version_major()
+
   if(version < 4){
     repo <- "tessdata"
     release <- "3.04.00"
   } else {
-    repo <- "tessdata_fast"
+    repo <- ifelse(best, "tessdata_best", "tessdata_fast")
     release <- "4.1.0"
   }
-  url <- sprintf('https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata', repo, release, lang)
+
+  url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang)
+
+  destfile <- file.path(datapath, basename(url))
+
+  if (file.exists(destfile)) {
+    message("Training data already exists.")
+    return(destfile)
+  }
+
   req <- curl::curl_fetch_memory(url, curl::new_handle(
     progressfunction = progress_fun,
     noprogress = !isTRUE(progress)
   ))
-  if(progress)
+
+  if (progress) {
     cat("\n")
-  if(req$status_code != 200)
+  }
+
+  if (req$status_code != 200) {
     stop("Download failed: HTTP ", req$status_code, call. = FALSE)
-  destfile <- file.path(datapath, basename(url))
+  }
+
   writeBin(req$content, destfile)
+
   return(destfile)
 }
 
 progress_fun <- function(down, up) {
   total <- down[[1]]
   now <- down[[2]]
-  pct <- if(length(total) && total > 0){
-    paste0("(", round(now/total * 100), "%)")
+  pct <- if (length(total) && total > 0) {
+    paste0("(", round(now / total * 100), "%)")
   } else {
     ""
   }
-  if(now > 10000)
+  if (now > 10000) {
     cat("\r Downloaded:", sprintf("%.2f", now / 2^20), "MB ", pct)
+  }
   TRUE
 }
 
-warn_on_linux <- function(){
-  if(identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")){
+warn_on_linux <- function() {
+  if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) {
     warning("On Linux you should install training data via yum/apt. Please check the manual page.", call. = FALSE)
   }
 }
diff --git a/man/tessdata.Rd b/man/tessdata.Rd
index d74e3d9..242d516 100644
--- a/man/tessdata.Rd
+++ b/man/tessdata.Rd
@@ -5,19 +5,25 @@
 \alias{tessdata}
 \title{Tesseract Training Data}
 \usage{
-tesseract_download(lang, datapath = NULL, progress = interactive())
+tesseract_download(
+  lang,
+  datapath = NULL,
+  best = FALSE,
+  progress = interactive()
+)
 }
 \arguments{
 \item{lang}{three letter code for language, see \href{https://github.com/tesseract-ocr/tessdata}{tessdata} repository.}
 
 \item{datapath}{destination directory where to download store the file}
 
+\item{best}{logical; if \code{TRUE}, download the most accurate (but slower) \code{tessdata_best} models instead of \code{tessdata_fast}. Ignored for Tesseract versions below 4.0.}
+
 \item{progress}{print progress while downloading}
 }
 \description{
 Helper function to download training data from the official
-\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. Only use this function on
-Windows and OS-X. On Linux, training data can be installed directly with
+\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. On Linux, the fast training data can be installed directly with
 \href{https://src.fedoraproject.org/rpms/tesseract}{yum} or
 \href{https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-}{apt-get}.
 }
@@ -36,8 +42,9 @@ and stores it in a the path on disk given by the \code{TESSDATA_PREFIX} variable
 }
 \examples{
 \dontrun{
-if(is.na(match("fra", tesseract_info()$available)))
+if (is.na(match("fra", tesseract_info()$available))) {
   tesseract_download("fra")
+}
 french <- tesseract("fra")
 text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french)
 cat(text)