From d4ed0d0c4accad9fb189ed77688cd75dca6a10c6 Mon Sep 17 00:00:00 2001
From: "Andrew G. Brown"
Date: Fri, 2 Jul 2021 20:13:59 -0700
Subject: [PATCH] parseOSD updates to use SKB sections

---
 NAMESPACE              |   3 +
 R/create_OSD.R         |  10 +-
 R/parseOSD_functions.R | 424 +++++++++++++++++++++--------------------
 3 files changed, 225 insertions(+), 212 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 3e282d18be..1e5382689e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -19,7 +19,10 @@ importFrom(rvest,html_node)
 importFrom(rvest,html_nodes)
 importFrom(rvest,html_text)
 importFrom(stats,aggregate)
+importFrom(stringi,stri_detect_fixed)
 importFrom(stringi,stri_enc_toascii)
+importFrom(stringi,stri_match)
+importFrom(stringi,stri_match_all)
 importFrom(stringi,stri_trans_general)
 importFrom(tibble,as_tibble)
 importFrom(tools,file_ext)
diff --git a/R/create_OSD.R b/R/create_OSD.R
index a24f85434b..1a2013dc1b 100644
--- a/R/create_OSD.R
+++ b/R/create_OSD.R
@@ -373,7 +373,11 @@ osd_to_json <- function(logfile = file.path(output_dir, "OSD/OSD.log"),
 
   x <- validateOSD(logfile, filepath)
 
-  typicalpedon <- .parseTypicalPedon(x[["TYPICAL PEDON"]])
+  parsed.OSD <- .doParseOSD(x)
+
+  # SPC-style components from parseOSD returned as nested data.frames in JSON
+  x$SITE <- I(list(parsed.OSD$`site-data`))
+  x$HORIZONS <- I(list(parsed.OSD$`hz-data`))
 
   if (is.logical(x))
     if (!x)
       return(FALSE)
@@ -392,7 +396,3 @@ osd_to_json <- function(logfile = file.path(output_dir, "OSD/OSD.log"),
   names(res) <- all_osds
   return(res)
 }
-
-.parseTypicalPedon <- function(section) {
-
-}
\ No newline at end of file
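The `I(list(...))` wrapper above is what keeps the parsed site and horizon tables nested as data.frames within a single OSD record. A minimal sketch of the intended JSON shape, assuming jsonlite is the serializer used downstream (series name and values are invented for illustration):

x <- list(SERIES = 'EXAMPLE', SITE = I(list(data.frame(drainage = 'well'))))
jsonlite::toJSON(x, auto_unbox = TRUE)
#> {"SERIES":"EXAMPLE","SITE":[[{"drainage":"well"}]]}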
diff --git a/R/parseOSD_functions.R b/R/parseOSD_functions.R
index b6ac023569..962e6d0e4b 100644
--- a/R/parseOSD_functions.R
+++ b/R/parseOSD_functions.R
@@ -1,14 +1,17 @@
-### moved all OSD HTML|TXT file getting / prep to soilDB
+# parseOSD functions by dylan beaudette
+### moved all OSD HTML|TXT|JSON file getting to soilDB
 
-####### fulltext search support ###################
+######## fulltext search support ########
 
-# re-make entire fulltext table, containing an OSD per record
-makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
+
+# # re-make entire fulltext table, containing an OSD per record
+.makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
   # reset fulltext SQL file
-  cat('DROP TABLE osd.osd_fulltext;\n', file=outputFile)
-  cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n', file=outputFile, append = TRUE)
-  cat("set client_encoding to 'latin1' ;\n", file=outputFile, append = TRUE)
+  cat('DROP TABLE osd.osd_fulltext;\n', file = outputFile)
+  cat('CREATE TABLE osd.osd_fulltext (series citext, fulltext text);\n',
+      file = outputFile, append = TRUE)
+  cat("set client_encoding to 'latin1' ;\n", file = outputFile, append = TRUE)
 
   # remove NULL elements
   idx <- which(! sapply(fullTextList, is.null))
@@ -24,33 +27,33 @@ makeFullTextTable <- function(fullTextList, outputFile='fulltext-data.sql') {
 }
 
-# re-make sectioned fulltext table, containing an OSD per record
-makeFullTextSectionsTable <- function(fullTextList, outputFile='fulltext-section-data.sql') {
+# # re-make sectioned fulltext table, containing an OSD per record
+.makeFullTextSectionsTable <- function(fullTextList, outputFile='fulltext-section-data.sql') {
   # reset fulltext SQL file
   # need to adjust fields manually as we edit
   cat('DROP TABLE osd.osd_fulltext2;\n', file='fulltext-section-data.sql')
   cat('CREATE TABLE osd.osd_fulltext2 (
-series citext,
-brief_narrative text,
-taxonomic_class text,
-typical_pedon text,
-type_location text,
-ric text,
-competing_series text,
-geog_location text,
-geog_assoc_soils text,
-drainage text,
-use_and_veg text,
-distribution text,
-remarks text,
-established text,
-additional_data text
- );\n', file='fulltext-section-data.sql', append = TRUE)
- cat("set client_encoding to 'latin1' ;\n", file='fulltext-section-data.sql', append = TRUE)
+  series citext,
+  brief_narrative text,
+  taxonomic_class text,
+  typical_pedon text,
+  type_location text,
+  ric text,
+  competing_series text,
+  geog_location text,
+  geog_assoc_soils text,
+  drainage text,
+  use_and_veg text,
+  distribution text,
+  remarks text,
+  established text,
+  additional_data text
+  );\n', file='fulltext-section-data.sql', append = TRUE)
+  cat("set client_encoding to 'latin1' ;\n", file = 'fulltext-section-data.sql', append = TRUE)
 
   # remove NULL elements
-  idx <- which(! sapply(fullTextList, is.null))
+  idx <- which(!sapply(fullTextList, is.null))
   fullTextList <- fullTextList[idx]
 
   # iterate over list elements and write to file
@@ -63,10 +66,8 @@ additional_data text
 }
 
-
-
-# convert HTML text to fulltext DB table record
-ConvertToFullTextRecord <- function(s, s.lines, tablename='osd.osd_fulltext') {
+# # convert HTML text to fulltext DB table record
+.ConvertToFullTextRecord <- function(s, s.lines, tablename='osd.osd_fulltext') {
   # collapse to single chunk
   s.text <- paste(s.lines, collapse = '\n')
   # convert into INSERT statement
@@ -76,129 +77,140 @@ ConvertToFullTextRecord <- function(s, s.lines, tablename='osd.osd_fulltext') {
   return(res)
 }
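For reference, the dollar-quoted statements emitted by `.ConvertToFullTextRecord()` look roughly like the sketch below. The middle of the function is elided by the hunk above, so the exact `paste0()` template is an assumption; the series name and narrative are invented:

s <- 'EXAMPLE'
s.lines <- c('EXAMPLE SERIES', 'The Example series consists of ...')
cat(.ConvertToFullTextRecord(s, s.lines))
#> INSERT INTO osd.osd_fulltext VALUES ( $$EXAMPLE$$ , $$EXAMPLE SERIES
#> The Example series consists of ...$$ );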
 
-
-# convert HTML text to an insert statement with data split by section
-ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') {
+# # convert HTML text to an insert statement with data split by section
+.ConvertToFullTextRecord2 <- function(s, s.lines, tablename='osd.osd_fulltext2') {
+  # TODO: convert to use SKB sections
   # split sections to list, section titles hard-coded
-  sections <- extractSections(s.lines)
+  sections <- .extractSections(s.lines)
 
   # get names of all sections
   st <- names(.sectionData)
 
   # combine sections with $$ quoting
-  blob <- sapply(st, function(i) {paste0('$$', sections[[i]], '$$')})
-  res <- paste0('INSERT INTO ', tablename, ' VALUES ( $$', s, '$$, ', paste(blob, collapse = ', '), ');\n')
-  return(res)
-}
-
-
-####### fulltext search support ###################
-
-
-
-
-
-downloadParseSave <- function(i, path = 'OSD') {
-
-  # get OSD from WWW pages / HTML parsing
-  # result is a list
-  # i.lines <- try(soilDB:::.getLiveOSD(i), silent = TRUE)
-
-  # get from local OSDRegistry repository
-  # result is a list
-  i.lines <- try(
-    soilDB:::.getLocalOSD(i, path = path),
-    silent = TRUE
-  )
-
-  # no OSD...
-  if(class(i.lines) == 'try-error')
-    return(FALSE)
-
-  # output as list
-  res <- list()
-
-  # register section REGEX
-  # this sets / updates a global variable
-  setSectionREGEX(i)
-
-  # get rendered HTML->text and save to file
-  # store gzip-compressd OSD for bulk INSERT
-  res[['fulltext']] <- memCompress(ConvertToFullTextRecord(i, i.lines), type='gzip')
-
-  ## previously:
-  # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
-
-  # split data into sections for fulltext search, catch errors related to parsing sections
-  i.sections <- try(ConvertToFullTextRecord2(i, i.lines))
-  if(class(i.sections) != 'try-error') {
-
-    # store gzip-compressed sections for bulk INSERT
-    res[['sections']] <- memCompress(i.sections, type='gzip')
-
-    ## previously:
-    # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)
-  }
-
-
-  # append hz data to our list, catch errors related to parsing sections
-  hz.data <- try(extractHzData(i.lines), silent = TRUE)
-
-  # append site data to our list, catch errors related to parsing sections
-  section.data <- try(extractSections(i.lines), silent = TRUE)
-  site.data <- try(extractSiteData(section.data), silent = TRUE)
-
-  # append if result was a data.frame
-  if(class(hz.data) == 'data.frame') {
-    # add seriesname to final column
-    hz.data$seriesname <- i
-    res[['hz']] <- hz.data
-  }
-
-  # append if result was a data.frame
-  if(class(site.data) == 'data.frame') {
-    # add seriesname to final column
-    site.data$seriesname <- i
-    res[['site']] <- site.data
-  }
-
+  blob <- sapply(st, function(i) {
+    paste0('$$', sections[[i]], '$$')
+  })
+  res <- paste0('INSERT INTO ', tablename, ' VALUES ( $$',
+                s, '$$, ', paste(blob, collapse = ', '), ');\n')
   return(res)
 }
 
-# use this approach to make the wrapper "safe"
-downloadParseSave.safe <- purrr::safely(downloadParseSave)
-
-
-testIt <- function(x) {
-  # get data
-  res <- soilDB:::.getLiveOSD(x)
-
-  # init section REGEX: critical for locating brief narrative
-  setSectionREGEX(x)
-
-  # extract sections
-  l <- list()
-  l[['sections']] <- extractSections(res)
-  l[['section-indices']] <- findSectionIndices(res)
-  l[['site-data']] <- extractSiteData(l[['sections']])
-  l[['hz-data']] <- extractHzData(res)
-
-  return(l)
-}
-
-testItLocal <- function(x, path = 'OSD') {
-  # get data
-  res <- soilDB:::.getLocalOSD(x, path)
-
-  # init section REGEX: critical for locating brief narrative
-  setSectionREGEX(x)
+######## deprecated: OSD downloading ########
+
+
+# TODO: artifacts for gzipped fulltext search product
+
+# downloadParseSave <- function(i, path = 'OSD') {
+#
+#   # get OSD from WWW pages / HTML parsing
+#   # result is a list
+#   # i.lines <- try(soilDB:::.getLiveOSD(i), silent = TRUE)
+#
+#   # get from local OSDRegistry repository
+#   # result is a list
+#   i.lines <- try(
+#     soilDB:::.getLocalOSD(i, path = path),
+#     silent = TRUE
+#   )
+#
+#   # no OSD...
+#   if(class(i.lines) == 'try-error')
+#     return(FALSE)
+#
+#   # output as list
+#   res <- list()
+#
+#   # register section REGEX
+#   # this sets / updates a global variable
+#   .setSectionREGEX(i)
+#
+#   # get rendered HTML->text and save to file
+#   # store gzip-compressed OSD for bulk INSERT
+#   res[['fulltext']] <- memCompress(.ConvertToFullTextRecord(i, i.lines), type='gzip')
+#
+#   ## previously:
+#   # cat(i.fulltext, file = 'fulltext-data.sql', append = TRUE)
+#
+#   # split data into sections for fulltext search, catch errors related to parsing sections
+#   i.sections <- try(.ConvertToFullTextRecord2(i, i.lines))
+#   if(class(i.sections) != 'try-error') {
+#
+#     # store gzip-compressed sections for bulk INSERT
+#     res[['sections']] <- memCompress(i.sections, type='gzip')
+#
+#     ## previously:
+#     # cat(i.sections, file = 'fulltext-section-data.sql', append = TRUE)
+#   }
+#
+#
+#   # append hz data to our list, catch errors related to parsing sections
+#   hz.data <- try(.extractHzData(i.lines), silent = TRUE)
+#
+#   # append site data to our list, catch errors related to parsing sections
+#   section.data <- try(.extractSections(i.lines), silent = TRUE)
+#   site.data <- try(.extractSiteData(section.data), silent = TRUE)
+#
+#   # append if result was a data.frame
+#   if(class(hz.data) == 'data.frame') {
+#     # add seriesname to final column
+#     hz.data$seriesname <- i
+#     res[['hz']] <- hz.data
+#   }
+#
+#   # append if result was a data.frame
+#   if(class(site.data) == 'data.frame') {
+#     # add seriesname to final column
+#     site.data$seriesname <- i
+#     res[['site']] <- site.data
+#   }
+#
+#   return(res)
+# }
+#
+# # use this approach to make the wrapper "safe"
+# downloadParseSave.safe <- purrr::safely(downloadParseSave)
+#
+#
+# testIt <- function(x) {
+#   # get data
+#   res <- soilDB:::.getLiveOSD(x)
+#
+#   # init section REGEX: critical for locating brief narrative
+#   .setSectionREGEX(x)
+#
+#   # extract sections
+#   l <- list()
+#   l[['sections']] <- .extractSections(res)
+#   l[['section-indices']] <- .findSectionIndices(res)
+#   l[['site-data']] <- .extractSiteData(l[['sections']])
+#   l[['hz-data']] <- .extractHzData(res)
+#
+#   return(l)
+# }
+
+######## OSD parsing ########
+
+#' Prepare Site and Horizon _data.frame_ from a `validateOSD()` result
+#' @param x a _list_ result of SoilKnowledgeBase::validateOSD()
+.doParseOSD <- function(x) {
+  # # get data
+  # res <- soilDB:::.getLocalOSD(x, path)
+  #
+  # # init section REGEX: critical for locating brief narrative
+  # .setSectionREGEX(x)
+  #
+  # # extract sections
+  # l[['sections']] <- .extractSections(res)
+  # l[['section-indices']] <- .findSectionIndices(res)
 
-  # extract sections
   l <- list()
-  l[['sections']] <- extractSections(res)
-  l[['section-indices']] <- findSectionIndices(res)
-  l[['site-data']] <- extractSiteData(l[['sections']])
-  l[['hz-data']] <- extractHzData(res)
+  l[['site-data']] <- .extractSiteData(x)
+  tp <- strsplit(as.character(x$`TYPICAL PEDON`$content), "\n")
+  if (length(tp) > 0)
+    l[['hz-data']] <- .extractHzData(tp[[1]])
+  else
+    l[['hz-data']] <- data.frame(name = NA)
 
   return(l)
 }
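A usage sketch for the new entry point. `x` is assumed to be the named list returned by SoilKnowledgeBase::validateOSD(), with each section (e.g. `TYPICAL PEDON`) carrying its text in a `content` field:

x <- validateOSD(logfile, filepath)   # as in osd_to_json() above
p <- .doParseOSD(x)

str(p, max.level = 1)
#> List of 2
#>  $ site-data: 'data.frame' ...   # currently one row: the parsed drainage class
#>  $ hz-data  : 'data.frame' ...   # one row per TYPICAL PEDON horizon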
@@ -207,9 +219,10 @@
 ## safely (efficiently?) find specific classes within a vector of narratives
 # needle: class labels
 # haystack: narrative by horizon
-findClass <- function(needle, haystack) {
+#' @importFrom stringi stri_detect_fixed
+.findClass <- function(needle, haystack) {
   # iterate over vector of horizon narratives, searching for exact matches
-  test.by.hz <- lapply(haystack, stri_detect_fixed, pattern = needle, opts_fixed = list(case_insensitive=TRUE))
+  test.by.hz <- lapply(haystack, stringi::stri_detect_fixed, pattern = needle, opts_fixed = list(case_insensitive = TRUE))
 
   # iterate over search results by horizon, keeping names of matching classes
   matches <- lapply(test.by.hz, function(i) {
@@ -217,7 +230,7 @@
   })
 
   # compute number of characters: longer matches are the most specific / correct
-  res <- map_chr(matches, function(i) {
+  res <- sapply(matches, function(i) {
     # find the longest matching string
     idx <- which.max(nchar(i))
@@ -239,7 +252,7 @@
 
 
 # vectorized parsing of texture class from OSD
-parse_texture <- function(text) {
+.parse_texture <- function(text) {
   # mineral texture classes, sorted from coarse -> fine
   textures <- c('coarse sand', 'sand', 'fine sand', 'very fine sand', 'loamy coarse sand', 'loamy sand', 'loamy fine sand', 'loamy very fine sand', 'coarse sandy loam', 'sandy loam', 'fine sandy loam', 'very fine sandy loam', 'loam', 'silt loam', 'silt', 'sandy clay loam', 'clay loam', 'silty clay loam', 'sandy clay', 'silty clay', 'clay')
@@ -260,21 +273,23 @@
   # m <- tolower(m[, 2])
 
   ## 2019-05-29: generalized for all non-greedy, exact matching
-  m <- findClass(needle=textures, haystack=text)
+  m <- .findClass(needle = textures, haystack = text)
   m <- tolower(m)
 
   # convert to ordered factor
-  m <- factor(m, levels=textures, ordered = TRUE)
+  m <- factor(m, levels = textures, ordered = TRUE)
   return(m)
 }
 
 
 # vectorized parsing of horizon boundary
-parse_hz_boundary <- function(text) {
+#' @importFrom stringi stri_match
+.parse_hz_boundary <- function(text) {
 
   distinctness <- c('very abrupt', 'abrupt', 'clear', 'gradual', 'diffuse')
   topography <- c('smooth', 'wavy', 'irregular', 'broken')
 
-  bdy <- apply(expand.grid(distinctness, topography), 1, paste, collapse=' ')
+  bdy <- apply(expand.grid(distinctness, topography), 1, paste, collapse = ' ')
 
   ## TODO: this is too greedy ?
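The longest-match ranking in `.findClass()` (its middle is elided by the hunks above) is what keeps overlapping class names from clobbering one another; a small sketch with an invented narrative:

# 'loam', 'sandy loam', and 'fine sandy loam' all match this narrative;
# ranking candidate matches by nchar() keeps the most specific class
.findClass(
  needle = c('loam', 'sandy loam', 'fine sandy loam'),
  haystack = 'brown fine sandy loam; moderate fine granular structure'
)
#> [1] "fine sandy loam"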
@@ -284,7 +299,7 @@
   bdy.regex <- paste0('(', paste(bdy, collapse='|'), ') boundary')
 
   # get matches
-  m <- stri_match(text, regex = bdy.regex, mode='first', opts_regex=list(case_insensitive=TRUE))
+  m <- stringi::stri_match(text, regex = bdy.regex, mode = 'first', opts_regex = list(case_insensitive = TRUE))
 
   # fail gracefully in the case of no section data or no matches
   if(nrow(m) < 1)
@@ -295,29 +310,31 @@
 
   # split into pieces
   res <- data.frame(
-    distinctness = findClass(needle=distinctness, haystack=m),
-    topography = findClass(needle=topography, haystack=m),
+    distinctness = .findClass(needle = distinctness, haystack = m),
+    topography = .findClass(needle = topography, haystack = m),
     stringsAsFactors = FALSE
   )
 
   return(res)
 }
 
 
 # vectorized parsing of coarse fraction qty+class from OSD
-parse_CF <- function(text) {
+#' @importFrom stringi stri_match
+.parse_CF <- function(text) {
 
   cf.type <- c('gravelly', 'cobbly', 'stony', 'bouldery', 'channery', 'flaggy')
   cf.qty <- c('', 'very ', 'extremely ')
-  cf <- apply(expand.grid(cf.qty, cf.type), 1, paste, collapse='')
+  cf <- apply(expand.grid(cf.qty, cf.type), 1, paste, collapse = '')
 
   ## TODO: this is too greedy as 'fine sand' will be found _within_ 'fine sandy loam'
   # https://github.com/dylanbeaudette/parse-osd/issues/10
 
   # combine into capturing REGEX
-  cf.regex <- paste0('(', paste(cf, collapse='|'), ')')
+  cf.regex <- paste0('(', paste(cf, collapse = '|'), ')')
 
   # get matches
-  m <- stri_match(text, regex = cf.regex, mode='first', opts_regex=list(case_insensitive=TRUE))
+  m <- stringi::stri_match(text, regex = cf.regex, mode = 'first', opts_regex = list(case_insensitive = TRUE))
 
   # fail gracefully in the case of no section data or no matches
   if(nrow(m) < 1)
@@ -330,13 +347,14 @@
 }
 
 # vectorized parsing of pH
-parse_pH <- function(text) {
+#' @importFrom stringi stri_match
+.parse_pH <- function(text) {
 
   # combine into capturing REGEX
   ph.regex <- '\\(ph\\s?([0-9]\\.[0-9])\\)'
 
   # get matches
-  m <- stri_match(text, regex = ph.regex, mode='first', opts_regex=list(case_insensitive=TRUE))
+  m <- stringi::stri_match(text, regex = ph.regex, mode = 'first', opts_regex = list(case_insensitive = TRUE))
 
   # fail gracefully in the case of no section data or no matches
   if(nrow(m) < 1)
@@ -350,24 +368,26 @@
 
 
 # vectorized parsing of pH class
-parse_pH_class <- function(text) {
+#' @importFrom stringi stri_match
+.parse_pH_class <- function(text) {
 
   # pH classes, in order, acid -> alkaline
   pH_classes <- c('ultra acid', 'extremely acid', 'very strongly acid', 'strongly acid', 'moderately acid', 'slightly acid', 'neutral', 'slightly alkaline', 'mildly alkaline', 'moderately alkaline', 'strongly alkaline', 'very strongly alkaline')
 
   ## 2019-05-29: generalized for all non-greedy, exact matching
-  m <- findClass(needle=pH_classes, haystack=text)
+  m <- .findClass(needle = pH_classes, haystack = text)
   m <- tolower(m)
 
   # return as an ordered factor acidic -> basic
-  m <- factor(m, levels=pH_classes, ordered = TRUE)
+  m <- factor(m, levels = pH_classes, ordered = TRUE)
   return(m)
 }
 
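All of these parsers share the same capturing-group idiom; for `.parse_pH()` the match matrix looks like this (narrative text invented for illustration):

txt <- 'slightly acid (pH 6.2); clear wavy boundary.'
m <- stringi::stri_match(txt, regex = '\\(ph\\s?([0-9]\\.[0-9])\\)',
                         mode = 'first',
                         opts_regex = list(case_insensitive = TRUE))
m
#>      [,1]       [,2]
#> [1,] "(pH 6.2)" "6.2"
as.numeric(m[, 2])   # column 2 holds the captured group
#> [1] 6.2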
 
 # vectorized parsing of drainage class
-parse_drainage_class <- function(text) {
+#' @importFrom stringi stri_match
+.parse_drainage_class <- function(text) {
 
   # drainage classes, in order, lower case
   classes <- c("excessively", "somewhat excessively", "well", "moderately well",
@@ -378,10 +398,10 @@
 
   # combine into capturing REGEX
-  classes.regex <- paste0('(', paste(classes, collapse='|'), ')')
+  classes.regex <- paste0('(', paste(classes, collapse = '|'), ')')
 
   # get matches
-  m <- stri_match(text, regex = classes.regex, mode='first', opts_regex=list(case_insensitive=TRUE))
+  m <- stringi::stri_match(text, regex = classes.regex, mode = 'first', opts_regex = list(case_insensitive = TRUE))
 
   # fail gracefully in the case of no section data or no matches
   if(nrow(m) < 1)
@@ -391,20 +411,17 @@
   m <- tolower(m[, 2])
 
   # return as an ordered factor, excessively -> very poorly
-  m <- factor(m, levels=classes, ordered = TRUE)
+  m <- factor(m, levels = classes, ordered = TRUE)
   return(m)
 }
 
-
-
-### TODO: test / move into aqp ###
-
+######## deprecated: section parsing methods ########
 
 ## this has to be done before any extraction is possible
 # init the section names and REGEX search patterns
 # the current series name is required for the top-most section
-setSectionREGEX <- function(s) {
+.setSectionREGEX <- function(s) {
   ## temporary hack: storing as a global variable
   # values are REGEX that try to accommodate typos
   # names are the proper section names
@@ -432,12 +449,8 @@
 }
 
 
-
-
-
-
-# locate section line numbers
-findSectionIndices <- function(chunk.lines) {
+# # locate section line numbers
+.findSectionIndices <- function(chunk.lines) {
 
   # result is a list, sometimes a section REGEX will match multiple lines
   s <- lapply(.sectionData, function(st) grep(st, chunk.lines, ignore.case = TRUE))
@@ -454,14 +467,15 @@
   return(indices)
 }
 
-# extract sections from lines of OSD
-extractSections <- function(chunk.lines, collapseLines=TRUE) {
+# # extract sections from lines of OSD
+.extractSections <- function(chunk.lines, collapseLines = TRUE) {
+
   # storage
   l <- list()
 
   # locate section lines
   # note: this will give values inclusive of the next section
-  section.locations <- findSectionIndices(chunk.lines)
+  section.locations <- .findSectionIndices(chunk.lines)
   section.names <- names(section.locations)
 
   # combine chunks into a list
@@ -472,10 +486,10 @@
     start.line <- section.locations[i]
 
     # this stop line overlaps with the start of the next, decrease index by 1
-    stop.line <- section.locations[i+1] - 1
+    stop.line <- section.locations[i + 1] - 1
 
     # extract current chunk
-    chunk <- chunk.lines[start.line : stop.line]
+    chunk <- chunk.lines[start.line:stop.line]
 
     # special case #1: the brief narrative is split over two lines; first line is junk
     if(this.name == 'BRIEF DESCRIPTION' & length(chunk) > 1) {
@@ -485,7 +499,7 @@
 
     # optionally combine lines
     if(collapseLines)
-      chunk <- paste(chunk, collapse='')
+      chunk <- paste(chunk, collapse = '')
 
     # attempt to remove section name
     chunk <- gsub(this.name, '', chunk)
@@ -497,41 +511,37 @@
   return(l)
 }
 
-
-
-
-
+######## extract SPC-style data.frames ########
 
 # parse important pieces from sections
 # x: list of section chunks
-extractSiteData <- function(x) {
+.extractSiteData <- function(x) {
 
   ## drainage class
   # this works for the standard OSD format
-  drainage.class <- parse_drainage_class(x[['DRAINAGE AND PERMEABILITY']])
+  drainage.class <- .parse_drainage_class(x[['DRAINAGE.AND.PERMEABILITY']])
 
   # alternative for SSR1 updated OSD format
   # https://casoilresource.lawr.ucdavis.edu/sde/?series=bordengulch
   if(is.na(drainage.class)) {
-    drainage.class <- parse_drainage_class(x[['BRIEF DESCRIPTION']])
+    drainage.class <- .parse_drainage_class(x[['OVERVIEW']])
   }
 
-  ## other things?
+  ## TODO: other things?
 
   # composite into a list for later
-  r <- data.frame(drainage=drainage.class)
+  r <- data.frame(drainage = drainage.class)
 
   return(r)
 }
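Note the new section keys: `validateOSD()` output uses dot-separated names (`DRAINAGE.AND.PERMEABILITY`, `OVERVIEW`) in place of the space-separated HTML section titles. A sketch of the expected input/output, with invented text:

x <- list(`DRAINAGE.AND.PERMEABILITY` = 'Well drained; moderately rapid permeability.')
.extractSiteData(x)
#>   drainage
#> 1     well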
 
 
 # s.lines: result of getOSD()
+#' @importFrom stringi stri_match_all
 .extractHzData <- function(tp) {
-  options(stringsAsFactors=FALSE)
-
   ## TODO: this is kind of wasteful
   # this will not work in the presence of typos
   # new code for splitting blocks by section, lines from each section are not joined
   # sections <- extractSections(s.lines, collapseLines = FALSE)
@@ -608,11 +618,11 @@
     # parse hz designations and depths, keep first match
     ## hack
     # first try to find horizons with top AND bottom depths
-    h <- stri_match(this.chunk, regex=hz.rule)
+    h <- stringi::stri_match(this.chunk, regex = hz.rule)
     # if none, then try searching for only top depths
     if(all(is.na(h))) {
       # this won't have the correct number of elements, adjust manually
-      h <- stri_match(this.chunk, regex=hz.rule.no.bottom)
+      h <- stringi::stri_match(this.chunk, regex = hz.rule.no.bottom)
       h <- c(h, h[4]) # move units to 5th element
       h[4] <- NA # add fake missing bottom depth
     }
@@ -625,7 +635,7 @@
 
     ## TODO: test this!
     # parse ALL colors, result is a multi-row matrix, 5th column is moisture state
-    colors <- stri_match_all(this.chunk, regex=color.rule)[[1]]
+    colors <- stringi::stri_match_all(this.chunk, regex=color.rule)[[1]]
 
     # replace missing moisture state with (parsed) default value
     colors[, 5][which(colors[, 5] == '')] <- default.moisture.state
@@ -650,10 +660,10 @@
     return(NULL)
 
   # convert to DF
-  hz.data <- ldply(hz.data)[2:5]
-  dry.colors <- ldply(dry.colors)[2:4]
-  moist.colors <- ldply(moist.colors)[2:4]
-  narrative.data <- ldply(narrative.data)
+  hz.data <- as.data.frame(do.call('rbind', hz.data))[2:5]
+  dry.colors <- as.data.frame(do.call('rbind', dry.colors))[2:4]
+  moist.colors <- as.data.frame(do.call('rbind', moist.colors))[2:4]
+  narrative.data <- as.data.frame(do.call('rbind', narrative.data))
 
   names(hz.data) <- c('name', 'top', 'bottom', 'units')
   names(dry.colors) <- c('dry_hue', 'dry_value', 'dry_chroma')
@@ -670,7 +680,7 @@
   moist.colors$moist_value <- as.numeric(moist.colors$moist_value)
   moist.colors$moist_chroma <- as.numeric(moist.colors$moist_chroma)
 
-  ## TODO sanity check / unit reporting: this will fail when formatting is inconsistent (PROPER series)
+  ## TODO: sanity check / unit reporting: this will fail when formatting is inconsistent (PROPER series)
   # convert in -> cm using the first horizon
   if(hz.data$units[1] %in% c('inches', 'in')) {
     hz.data$top <- round(hz.data$top * 2.54)
@@ -684,12 +694,12 @@
   res <- cbind(hz.data, dry.colors, moist.colors)
 
   # parse out other elements from the narrative
-  res$texture_class <- parse_texture(narrative.data$narrative)
-  res$cf_class <- parse_CF(narrative.data$narrative)
-  res$pH <- parse_pH(narrative.data$narrative)
-  res$pH_class <- parse_pH_class(narrative.data$narrative)
+  res$texture_class <- .parse_texture(narrative.data$narrative)
+  res$cf_class <- .parse_CF(narrative.data$narrative)
+  res$pH <- .parse_pH(narrative.data$narrative)
+  res$pH_class <- .parse_pH_class(narrative.data$narrative)
 
-  bdy <- parse_hz_boundary(narrative.data$narrative)
+  bdy <- .parse_hz_boundary(narrative.data$narrative)
   res$distinctness <- bdy$distinctness
   res$topography <- bdy$topography
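Taken together, a minimal end-to-end sketch of the horizon parser, assuming the conventional `NAME--top to bottom units;` layout that the elided `hz.rule` targets (pedon text invented, output columns abbreviated):

tp <- c(
  'A--0 to 4 inches; brown (10YR 4/3) fine sandy loam, dark brown (10YR 3/3) moist; slightly acid (pH 6.2); clear wavy boundary.',
  'Bt--4 to 15 inches; yellowish brown (10YR 5/4) clay loam, dark yellowish brown (10YR 4/4) moist; moderately acid; gradual smooth boundary.'
)
h <- .extractHzData(tp)
h[, c('name', 'top', 'bottom', 'texture_class', 'pH')]
#>   name top bottom   texture_class  pH
#> 1    A   0     10 fine sandy loam 6.2
#> 2   Bt  10     38       clay loam  NA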