From 7e91cf32bb5f0d51f0b4540d131c5934cb9194a4 Mon Sep 17 00:00:00 2001
From: Yihui Xie <xie@yihui.name>
Date: Thu, 1 Aug 2024 15:44:00 -0500
Subject: [PATCH] factor out regmatches() + gregexpr()/gregexec()/regexec()

---
 DESCRIPTION |  2 +-
 R/fuse.R    |  8 ++++----
 R/mark.R    |  2 +-
 R/utils.R   | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 34d40c5..4646266 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: litedown
 Type: Package
 Title: A Lightweight Version of R Markdown
-Version: 0.0.32
+Version: 0.0.33
 Authors@R: c(
    person("Yihui", "Xie", role = c("aut", "cre"), email = "xie@yihui.name", comment = c(ORCID = "0000-0003-0645-5666")),
    person()
diff --git a/R/fuse.R b/R/fuse.R
index f524f39..746d07c 100644
--- a/R/fuse.R
+++ b/R/fuse.R
@@ -48,7 +48,7 @@ crack = function(input, text = NULL) {
     '<(code|code_block) sourcepos="(\\d+):(\\d+)-(\\d+):(\\d+)"( info="[{]+',
     rx_engine, '[^"]*?[}]")? xml:space="[^>]*>([^<]*)<'
   )
-  m = regmatches(xml, gregexec(r, xml, perl = TRUE))[[1]] %|% matrix(character(), 9)
+  m = match_all(xml, r, perl = TRUE)[[1]] %|% matrix(character(), 9)
   # code blocks must have non-empty info strings
   m = m[, m[2, ] != 'code_block' | m[8, ] != '', drop = FALSE]
 
@@ -127,7 +127,7 @@ crack = function(input, text = NULL) {
       }
       # possible comma-separated chunk options in header
       rx_opts = paste0('^(`{3,}|~{3,})\\s*([{]+)', rx_engine, '(.*?)\\s*[}]+\\s*$')
-      o = regmatches(code[1], regexec(rx_opts, code[1]))[[1]]
+      o = match_one(code[1], rx_opts)[[1]]
       if (length(o)) {
         # if two or more `{` is used, we will write chunk fences to output
         if (nchar(o[3]) > 1) b$fences = c(
@@ -169,7 +169,7 @@ crack = function(input, text = NULL) {
       x = lapply(seq_along(x), function(i) {
         z = x[i]
         if (i %% 2 == 1) return(z)
-        z = regmatches(z, regexec(rx_inline, z))[[1]][-1]
+        z = match_one(z, rx_inline)[[1]][-1]
         p2 = pos[, i / 2]; save_pos(p2)
         list(
           source = z[2], pos = p2,
@@ -796,7 +796,7 @@ sci_num = function(x) {
   r = '^(-)?([0-9.]+)e([-+])0*([0-9]+)$'
   x = format(signif(x, s), scientific = x != 0 && abs(log10(abs(x))) >= p)
   if (!grepl(r, x)) return(x)
-  n = regmatches(x, regexec(r, x))[[1]]
+  n = match_one(x, r)[[1]]
   sprintf(
     '%s%s10^{%s%s}', n[2], if (n[3] == '1') '' else paste(n[3], '\\times '),
     if (n[4] == '+') '' else n[4], n[5]
diff --git a/R/mark.R b/R/mark.R
index 044e059..d51717c 100644
--- a/R/mark.R
+++ b/R/mark.R
@@ -388,7 +388,7 @@ build_output = function(format, options, template, meta) {
 # substitute all variables in template with their values
 sub_vars = function(tpl, meta) {
   # find all variables in the template
-  vars = unlist(regmatches(tpl, gregexpr('[$][-_[:alnum:]]+[$]', tpl)))
+  vars = unlist(match_full(tpl, '[$][-_[:alnum:]]+[$]'))
   # insert $body$ at last in case the body contain any $variables$ accidentally
   if (!is.na(i <- match('$body$', vars))) vars = c(vars[-i], vars[i])
   for (v in vars) {
diff --git a/R/utils.R b/R/utils.R
index a7ae238..7f062dc 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -89,14 +89,31 @@ id_string = function(text, lens = c(5:10, 20), times = 20) {
 }
 
 # a shorthand for gregexpr() and regmatches()
-match_replace = function(x, pattern, replace = identity, ...) {
-  m = gregexpr(pattern, x, ...)
+match_replace = function(x, r, replace = identity, ...) {
+  m = gregexpr(r, x, ...)  # locate all matches of the regex `r` in `x`; `...` is passed to gregexpr()
   regmatches(x, m) = lapply(regmatches(x, m), function(z) {
     if (length(z)) replace(z) else z
   })
   x
 }
 
+# gregexec() + regmatches() to match the regex `r` globally (all matches) in `x` and capture substrings in regex groups
+match_all = function(x, r, ...) {
+  regmatches(x, base::gregexec(r, x, ...))  # `...` is passed to gregexec()
+}
+# for R < 4.1.0 (used when base has no gregexec()): emulate match_all() with match_full() + match_one()
+if (!exists('gregexec', baseenv(), inherits = TRUE)) match_all = function(x, r, ...) {
+  lapply(match_full(x, r, ...), function(z) {  # z: the full matches found in one element of x
+    if (length(z)) do.call(cbind, match_one(z, r, ...)) else z  # re-match each full match with `r`; bind results as columns
+  })
+}
+
+# regexec() + regmatches() to match the regex `r` once in each element of `x` and capture substrings (full match first, then regex groups); `...` is passed to regexec()
+match_one = function(x, r, ...) regmatches(x, regexec(r, x, ...))
+
+# gregexpr() + regmatches() to extract all full matches of the regex `r` in each element of `x`, but not substrings in regex groups; `...` is passed to gregexpr()
+match_full = function(x, r, ...) regmatches(x, gregexpr(r, x, ...))
+
 # if `text` is NULL and `input` is a file, read it; otherwise use the `text`
 # argument as input
 read_input = function(input, text) {
@@ -274,7 +291,7 @@ set_highlight = function(meta, options, html) {
   autoloader = 'plugins/autoloader/prism-autoloader.min.js'
   o$js = c(o$js, if (!is.null(l <- o$languages)) get_lang(l) else {
     # detect <code> languages in html and load necessary language components
-    lang = unlist(regmatches(html, gregexpr(r, html)))
+    lang = unlist(match_full(html, r))
     lang = gsub(' .*', '', lang)  # only use the first class name
     lang = setdiff(lang, 'plain')  # exclude known non-existent names
     f = switch(p, highlight = js_libs[[c(p, 'js')]], prism = autoloader)
@@ -312,8 +329,7 @@ lang_files = function(package, path, langs) {
     x = grep(r, x, value = TRUE)
     l = gsub(r, '\\1', x)
     # then find their aliases
-    m = gregexpr('(?<=aliases:\\[)[^]]+(?=\\])', x)
-    a = lapply(regmatches(x, m), function(z) {
+    a = lapply(match_full(x, '(?<=aliases:\\[)[^]]+(?=\\])'), function(z) {
       z = unlist(strsplit(z, '[",]'))
       z[!xfun::is_blank(z)]
     })
@@ -330,8 +346,7 @@ lang_files = function(package, path, langs) {
     l1
   } else {
     # dependencies and aliases (the arrays should be more than 1000 characters)
-    m = gregexpr('(?<=\\{)([[:alnum:]_-]+:\\[?"[^}]{1000,})(?=\\})', x)
-    x = unlist(regmatches(x, m))
+    x = unlist(match_full(x, '(?<=\\{)([[:alnum:]_-]+:\\[?"[^}]{1000,})(?=\\})'))
     if (length(x) < 2) {
       warning(
         "Unable to process Prism's autoloader plugin (", u, ") to figure out ",
@@ -341,8 +356,7 @@ lang_files = function(package, path, langs) {
       return()
     }
     x = x[1:2]
-    m = gregexpr('([[:alnum:]_-]+):(\\["[^]]+\\]|"[^"]+")', x)
-    x = lapply(regmatches(x, m), function(z) {
+    x = lapply(match_full(x, '([[:alnum:]_-]+):(\\["[^]]+\\]|"[^"]+")'), function(z) {
       z = gsub('[]["]', '', z)
       unlist(lapply(strsplit(z, '[:,]'), function(y) {
         set_names(list(y[-1]), y[1])
@@ -482,7 +496,7 @@ build_toc = function(html, n = 3) {
   if (n <= 0) return()
   if (n > 6) n = 6
   r = sprintf('<(h[1-%d])( id="[^"]+")?[^>]*>(.+?)</\\1>', n)
-  items = unlist(regmatches(html, gregexpr(r, html)))
+  items = unlist(match_full(html, r))
   if (length(items) == 0) return()
   x = gsub(r, '<toc\\2>\\3</toc>', items)  # use a tag <toc> to protect heading text
   h = as.integer(gsub('^h', '', gsub(r, '\\1', items)))  # heading level
@@ -680,8 +694,7 @@ unique_id = function(x, empty) {
 
 # number sections in HTML output
 number_sections = function(x) {
-  m = gregexpr('</h[1-6]>', x)
-  h = sub('</h([1-6])>', '\\1', unlist(regmatches(x, m)))
+  h = sub('</h([1-6])>', '\\1', unlist(match_full(x, '</h[1-6]>')))
   if (length(h) == 0) return(x)  # no headings
   h = min(as.integer(h))  # highest level of headings
   r = '<h([1-6])([^>]*)>(?!<span class="section-number)'
@@ -749,7 +762,7 @@ number_refs = function(x, r) {
 
   # first, find numbered section headings
   r2 = '<h[1-6][^>]*? id="((sec|chp)-[^"]+)"[^>]*><span class="section-number[^"]*">([0-9.]+)</span>'
-  m = regmatches(x, gregexec(r2, x))[[1]]
+  m = match_all(x, r2)[[1]]
   if (length(m)) {
     ids = m[2, ]
     db = as.list(set_names(m[4, ], ids))