From 64aa95ef6fec511194ef38f755c93ba751042bcb Mon Sep 17 00:00:00 2001 From: Mauricio 'Pacha' Vargas Sepulveda Date: Fri, 2 Aug 2024 21:10:31 -0400 Subject: [PATCH] small PR to download best models with Tesseract 4 or higher --- DESCRIPTION | 2 +- R/tessdata.R | 48 +++++++++++++++++++++++++++++++++--------------- man/tessdata.Rd | 15 +++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 838561b..d96ab6b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,7 @@ Imports: rappdirs, digest LinkingTo: Rcpp -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) Suggests: magick (>= 1.7), diff --git a/R/tessdata.R b/R/tessdata.R index 9ffeedf..5ca10af 100644 --- a/R/tessdata.R +++ b/R/tessdata.R @@ -1,8 +1,7 @@ #' Tesseract Training Data #' #' Helper function to download training data from the official -#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. Only use this function on -#' Windows and OS-X. On Linux, training data can be installed directly with +#' [tessdata](https://tesseract-ocr.github.io/tessdoc/Data-Files) repository. On Linux, the fast training data can be installed directly with #' [yum](https://src.fedoraproject.org/rpms/tesseract) or #' [apt-get](https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-). #' @@ -23,59 +22,78 @@ #' @family tesseract #' @param lang three letter code for language, see [tessdata](https://github.com/tesseract-ocr/tessdata) repository. 
#' @param datapath destination directory where to download store the file +#' @param best download the most accurate (but slower) trained models for Tesseract 4.0 or higher #' @param progress print progress while downloading #' @references [tesseract wiki: training data](https://tesseract-ocr.github.io/tessdoc/Data-Files) #' @examples \dontrun{ -#' if(is.na(match("fra", tesseract_info()$available))) +#' if (is.na(match("fra", tesseract_info()$available))) { #' tesseract_download("fra") +#' } #' french <- tesseract("fra") #' text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french) #' cat(text) #' } -tesseract_download <- function(lang, datapath = NULL, progress = interactive()){ +tesseract_download <- function(lang, datapath = NULL, best = FALSE, progress = interactive()) { stopifnot(is.character(lang)) - if(!length(datapath)){ + + if (!length(datapath)) { warn_on_linux() datapath <- tesseract_info()$datapath } datapath <- normalizePath(datapath, mustWork = TRUE) version <- tesseract_version_major() + if(version < 4){ repo <- "tessdata" release <- "3.04.00" } else { - repo <- "tessdata_fast" + repo <- ifelse(best, "tessdata_best", "tessdata_fast") release <- "4.1.0" } - url <- sprintf('https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata', repo, release, lang) + + url <- sprintf("https://github.com/tesseract-ocr/%s/raw/%s/%s.traineddata", repo, release, lang) + + destfile <- file.path(datapath, basename(url)) + + if (file.exists(destfile)) { + message("Training data already exists.") + return(destfile) + } + req <- curl::curl_fetch_memory(url, curl::new_handle( progressfunction = progress_fun, noprogress = !isTRUE(progress) )) - if(progress) + + if (progress) { cat("\n") - if(req$status_code != 200) + } + + if (req$status_code != 200) { stop("Download failed: HTTP ", req$status_code, call. 
= FALSE) - destfile <- file.path(datapath, basename(url)) + } + writeBin(req$content, destfile) + return(destfile) } progress_fun <- function(down, up) { total <- down[[1]] now <- down[[2]] - pct <- if(length(total) && total > 0){ - paste0("(", round(now/total * 100), "%)") + pct <- if (length(total) && total > 0) { + paste0("(", round(now / total * 100), "%)") } else { "" } - if(now > 10000) + if (now > 10000) { cat("\r Downloaded:", sprintf("%.2f", now / 2^20), "MB ", pct) + } TRUE } -warn_on_linux <- function(){ - if(identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")){ +warn_on_linux <- function() { + if (identical(.Platform$OS.type, "unix") && !identical(Sys.info()[["sysname"]], "Darwin")) { warning("On Linux you should install training data via yum/apt. Please check the manual page.", call. = FALSE) } } diff --git a/man/tessdata.Rd b/man/tessdata.Rd index d74e3d9..242d516 100644 --- a/man/tessdata.Rd +++ b/man/tessdata.Rd @@ -5,19 +5,25 @@ \alias{tessdata} \title{Tesseract Training Data} \usage{ -tesseract_download(lang, datapath = NULL, progress = interactive()) +tesseract_download( + lang, + datapath = NULL, + best = FALSE, + progress = interactive() +) } \arguments{ \item{lang}{three letter code for language, see \href{https://github.com/tesseract-ocr/tessdata}{tessdata} repository.} \item{datapath}{destination directory where to download store the file} +\item{best}{download the most accurate (but slower) trained models for Tesseract 4.0 or higher} + \item{progress}{print progress while downloading} } \description{ Helper function to download training data from the official -\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. Only use this function on -Windows and OS-X. On Linux, training data can be installed directly with +\href{https://tesseract-ocr.github.io/tessdoc/Data-Files}{tessdata} repository. 
On Linux, the fast training data can be installed directly with \href{https://src.fedoraproject.org/rpms/tesseract}{yum} or \href{https://packages.debian.org/search?suite=stable&section=all&arch=any&searchon=names&keywords=tesseract-ocr-}{apt-get}. } @@ -36,8 +42,9 @@ and stores it in a the path on disk given by the \code{TESSDATA_PREFIX} variable } \examples{ \dontrun{ -if(is.na(match("fra", tesseract_info()$available))) +if (is.na(match("fra", tesseract_info()$available))) { tesseract_download("fra") +} french <- tesseract("fra") text <- ocr("https://jeroen.github.io/images/french_text.png", engine = french) cat(text)