From 7e91cf32bb5f0d51f0b4540d131c5934cb9194a4 Mon Sep 17 00:00:00 2001 From: Yihui Xie Date: Thu, 1 Aug 2024 15:44:00 -0500 Subject: [PATCH] factor out regmatches() + gregexpr()/gregexec()/regexec() --- DESCRIPTION | 2 +- R/fuse.R | 8 ++++---- R/mark.R | 2 +- R/utils.R | 39 ++++++++++++++++++++++++++------------- 4 files changed, 32 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 34d40c5..4646266 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: litedown Type: Package Title: A Lightweight Version of R Markdown -Version: 0.0.32 +Version: 0.0.33 Authors@R: c( person("Yihui", "Xie", role = c("aut", "cre"), email = "xie@yihui.name", comment = c(ORCID = "0000-0003-0645-5666")), person() diff --git a/R/fuse.R b/R/fuse.R index f524f39..746d07c 100644 --- a/R/fuse.R +++ b/R/fuse.R @@ -48,7 +48,7 @@ crack = function(input, text = NULL) { '<(code|code_block) sourcepos="(\\d+):(\\d+)-(\\d+):(\\d+)"( info="[{]+', rx_engine, '[^"]*?[}]")? xml:space="[^>]*>([^<]*)<' ) - m = regmatches(xml, gregexec(r, xml, perl = TRUE))[[1]] %|% matrix(character(), 9) + m = match_all(xml, r, perl = TRUE)[[1]] %|% matrix(character(), 9) # code blocks must have non-empty info strings m = m[, m[2, ] != 'code_block' | m[8, ] != '', drop = FALSE] @@ -127,7 +127,7 @@ crack = function(input, text = NULL) { } # possible comma-separated chunk options in header rx_opts = paste0('^(`{3,}|~{3,})\\s*([{]+)', rx_engine, '(.*?)\\s*[}]+\\s*$') - o = regmatches(code[1], regexec(rx_opts, code[1]))[[1]] + o = match_one(code[1], rx_opts)[[1]] if (length(o)) { # if two or more `{` is used, we will write chunk fences to output if (nchar(o[3]) > 1) b$fences = c( @@ -169,7 +169,7 @@ crack = function(input, text = NULL) { x = lapply(seq_along(x), function(i) { z = x[i] if (i %% 2 == 1) return(z) - z = regmatches(z, regexec(rx_inline, z))[[1]][-1] + z = match_one(z, rx_inline)[[1]][-1] p2 = pos[, i / 2]; save_pos(p2) list( source = z[2], pos = p2, @@ -796,7 +796,7 @@ sci_num = function(x) { r = '^(-)?([0-9.]+)e([-+])0*([0-9]+)$' x = format(signif(x, s), scientific = x != 0 && abs(log10(abs(x))) >= p) if (!grepl(r, x)) return(x) - n = regmatches(x, regexec(r, x))[[1]] + n = match_one(x, r)[[1]] sprintf( '%s%s10^{%s%s}', n[2], if (n[3] == '1') '' else paste(n[3], '\\times '), if (n[4] == '+') '' else n[4], n[5] diff --git a/R/mark.R b/R/mark.R index 044e059..d51717c 100644 --- a/R/mark.R +++ b/R/mark.R @@ -388,7 +388,7 @@ build_output = function(format, options, template, meta) { # substitute all variables in template with their values sub_vars = function(tpl, meta) { # find all variables in the template - vars = unlist(regmatches(tpl, gregexpr('[$][-_[:alnum:]]+[$]', tpl))) + vars = unlist(match_full(tpl, '[$][-_[:alnum:]]+[$]')) # insert $body$ at last in case the body contain any $variables$ accidentally if (!is.na(i <- match('$body$', vars))) vars = c(vars[-i], vars[i]) for (v in vars) { diff --git a/R/utils.R b/R/utils.R index a7ae238..7f062dc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,14 +89,31 @@ id_string = function(text, lens = c(5:10, 20), times = 20) { } # a shorthand for gregexpr() and regmatches() -match_replace = function(x, pattern, replace = identity, ...) { - m = gregexpr(pattern, x, ...) +match_replace = function(x, r, replace = identity, ...) { + m = gregexpr(r, x, ...) regmatches(x, m) = lapply(regmatches(x, m), function(z) { if (length(z)) replace(z) else z }) x } +# gregexec() + regmatches() to greedy-match all substrings in regex groups +match_all = function(x, r, ...) { + regmatches(x, base::gregexec(r, x, ...)) +} +# for R < 4.1.0 +if (!exists('gregexec', baseenv(), inherits = TRUE)) match_all = function(x, r, ...) { + lapply(match_full(x, r, ...), function(z) { + if (length(z)) do.call(cbind, match_one(z, r, ...)) else z + }) +} + +# regexec() + regmatches() to match the regex once and capture substrings +match_one = function(x, r, ...) regmatches(x, regexec(r, x, ...)) + +# gregexpr() + regmatches() to match full strings but not substrings in regex groups +match_full = function(x, r, ...) regmatches(x, gregexpr(r, x, ...)) + # if `text` is NULL and `input` is a file, read it; otherwise use the `text` # argument as input read_input = function(input, text) { @@ -274,7 +291,7 @@ set_highlight = function(meta, options, html) { autoloader = 'plugins/autoloader/prism-autoloader.min.js' o$js = c(o$js, if (!is.null(l <- o$languages)) get_lang(l) else { # detect languages in html and load necessary language components - lang = unlist(regmatches(html, gregexpr(r, html))) + lang = unlist(match_full(html, r)) lang = gsub(' .*', '', lang) # only use the first class name lang = setdiff(lang, 'plain') # exclude known non-existent names f = switch(p, highlight = js_libs[[c(p, 'js')]], prism = autoloader) @@ -312,8 +329,7 @@ lang_files = function(package, path, langs) { x = grep(r, x, value = TRUE) l = gsub(r, '\\1', x) # then find their aliases - m = gregexpr('(?<=aliases:\\[)[^]]+(?=\\])', x) - a = lapply(regmatches(x, m), function(z) { + a = lapply(match_full(x, '(?<=aliases:\\[)[^]]+(?=\\])'), function(z) { z = unlist(strsplit(z, '[",]')) z[!xfun::is_blank(z)] }) @@ -330,8 +346,7 @@ lang_files = function(package, path, langs) { l1 } else { # dependencies and aliases (the arrays should be more than 1000 characters) - m = gregexpr('(?<=\\{)([[:alnum:]_-]+:\\[?"[^}]{1000,})(?=\\})', x) - x = unlist(regmatches(x, m)) + x = unlist(match_full(x, '(?<=\\{)([[:alnum:]_-]+:\\[?"[^}]{1000,})(?=\\})')) if (length(x) < 2) { warning( "Unable to process Prism's autoloader plugin (", u, ") to figure out ", @@ -341,8 +356,7 @@ lang_files = function(package, path, langs) { return() } x = x[1:2] - m = gregexpr('([[:alnum:]_-]+):(\\["[^]]+\\]|"[^"]+")', x) - x = lapply(regmatches(x, m), function(z) { + x = lapply(match_full(x, '([[:alnum:]_-]+):(\\["[^]]+\\]|"[^"]+")'), function(z) { z = gsub('[]["]', '', z) unlist(lapply(strsplit(z, '[:,]'), function(y) { set_names(list(y[-1]), y[1]) @@ -482,7 +496,7 @@ build_toc = function(html, n = 3) { if (n <= 0) return() if (n > 6) n = 6 r = sprintf('<(h[1-%d])( id="[^"]+")?[^>]*>(.+?)', n) - items = unlist(regmatches(html, gregexpr(r, html))) + items = unlist(match_full(html, r)) if (length(items) == 0) return() x = gsub(r, '\\3', items) # use a tag to protect heading text h = as.integer(gsub('^h', '', gsub(r, '\\1', items))) # heading level @@ -680,8 +694,7 @@ unique_id = function(x, empty) { # number sections in HTML output number_sections = function(x) { - m = gregexpr('', x) - h = sub('', '\\1', unlist(regmatches(x, m))) + h = sub('', '\\1', unlist(match_full(x, ''))) if (length(h) == 0) return(x) # no headings h = min(as.integer(h)) # highest level of headings r = ']*)>(?!]*>([0-9.]+)' - m = regmatches(x, gregexec(r2, x))[[1]] + m = match_all(x, r2)[[1]] if (length(m)) { ids = m[2, ] db = as.list(set_names(m[4, ], ids))