From 21a473705dc4250faca8f78e34f0d2d2c4d5a72d Mon Sep 17 00:00:00 2001 From: Tan Date: Fri, 29 Dec 2023 23:55:37 -0500 Subject: [PATCH] improve pb_read and pb_write documentation --- R/pb_read.R | 33 +++++++++++++++++++++++++++------ R/pb_write.R | 35 +++++++++++++++++++++++++++++------ man/guess_read_function.Rd | 37 +++++++++++++++++++++++++++++++++++++ man/guess_write_function.Rd | 37 +++++++++++++++++++++++++++++++++++++ man/pb_download_url.Rd | 29 ++++++++++++++++++++++++++--- man/pb_read.Rd | 18 ++++++++++++++---- man/pb_write.Rd | 17 ++++++++++++++--- 7 files changed, 184 insertions(+), 22 deletions(-) create mode 100644 man/guess_read_function.Rd create mode 100644 man/guess_write_function.Rd diff --git a/R/pb_read.R b/R/pb_read.R index be61afd..62d8c96 100644 --- a/R/pb_read.R +++ b/R/pb_read.R @@ -3,19 +3,23 @@ #' A convenience wrapper around writing an object to a temporary file and then #' uploading to a specified repo/release. This convenience comes at a cost to #' performance efficiency, since it first downloads the data to disk and then -#' reads the data from disk into memory. See `vignette("duckdb_arrow")` for +#' reads the data from disk into memory. See `vignette("cloud_native")` for #' alternative ways to bypass this flow and work with the data directly. #' #' @param file string: file name #' @param repo string: GH repository name in format "owner/repo". Default #' `guess_repo()` tries to guess based on current working directory's git repo #' @param tag string: tag for the GH release, defaults to "latest" -#' @param read_function function: specifies how to read in the data. Default -#' tries to guess a function based on file extension (csv, rds, parquet, txt, json) -#' @param ... additional arguments passed to `read_function` +#' @param read_function function: used to read in the data, where the file is +#' passed as the first argument and any additional arguments are subsequently +#' passed in via `...`. 
Default `guess_read_function(file)` will check the file +#' extension and try to find an appropriate read function if the extension is one +#' of rds, csv, tsv, parquet, txt, or json, and will abort if not found. +#' @param ... additional arguments passed to `read_function` after file #' @param .token GitHub authentication token, see [gh::gh_token()] #' #' @export +#' @family pb_rw #' #' @return Result of reading in the file in question. #' @examples \donttest{ @@ -50,6 +54,23 @@ pb_read <- function(file, read_function(file.path(tempdir(), file), ...) } +#' Guess read function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' reading it. +#' +#' `guess_read_function` understands the following file extensions: +#' - rds with `readRDS` +#' - csv, csv.gz, csv.xz with `utils::read.csv` +#' - tsv, tsv.gz, tsv.xz with `utils::read.delim` +#' - parquet with `arrow::read_parquet` +#' - txt, txt.gz, txt.xz with `readLines` +#' - json, json.gz, json.xz with `jsonlite::fromJSON` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for reading the file, if found +#' @keywords internal guess_read_function <- function(file){ file_ext <- tools::file_ext(gsub(x = file, pattern = ".gz$|.xz$", replacement = "")) if (file_ext == "parquet") rlang::check_installed("arrow") @@ -57,8 +78,8 @@ guess_read_function <- function(file){ read_fn <- switch( file_ext, "rds" = readRDS, - "csv" = read.csv, - "tsv" = read.delim, + "csv" = utils::read.csv, + "tsv" = utils::read.delim, "parquet" = arrow::read_parquet, "txt" = readLines, "json" = jsonlite::fromJSON, diff --git a/R/pb_write.R b/R/pb_write.R index ff9019e..60b24e1 100644 --- a/R/pb_write.R +++ b/R/pb_write.R @@ -8,21 +8,26 @@ #' @param repo string: GH repository name in format "owner/repo". 
Default #' `guess_repo()` tries to guess based on current working directory's git repo #' @param tag string: tag for the GH release, defaults to "latest" -#' @param write_function function: specifies how to read in the data. Default -#' tries to guess a function based on file extension (csv, rds, txt, parquet, json) +#' @param write_function function: used to write an R object to file, where the +#' object is passed as the first argument, the filename as the second argument, +#' and any additional arguments are subsequently passed in via `...`. Default +#' `guess_write_function(file)` will check the file extension and try to find an +#' appropriate write function if the extension is one of rds, csv, tsv, parquet, +#' txt, or json, and will abort if not found. #' @param ... additional arguments passed to `write_function` #' @param .token GitHub authentication token, see [gh::gh_token()] #' #' @export +#' @family pb_rw #' #' @return Writes file to release and returns github API response #' @examples \donttest{ -#' if (interactive()) { +#' \dontshow{if (interactive()) \{} #' pb_write(mtcars, "mtcars.rds", repo = "tanho63/piggyback-tests") #' #> ℹ Uploading to latest release: "v0.0.2". #' #> ℹ Uploading mtcars.rds ... #' #> |===============================================================| 100% -#' } +#' \dontshow{\}} #'} pb_write <- function(x, file, @@ -43,6 +48,23 @@ pb_write <- function(x, pb_upload(destfile, repo = repo, tag = tag, .token = .token) } +#' Guess write function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' writing to it. 
+#' +#' `guess_write_function` understands the following file extensions: +#' - rds with `saveRDS` +#' - csv, csv.gz, csv.xz with `utils::write.csv` +#' - tsv, tsv.gz, tsv.xz with a modified `utils::write.csv` where sep is set to `"\t"` +#' - parquet with `arrow::write_parquet` +#' - txt, txt.gz, txt.xz with `writeLines` +#' - json, json.gz, json.xz with `jsonlite::write_json` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for writing the file, if found +#' @keywords internal guess_write_function <- function(file){ file_ext <- tools::file_ext(gsub(x = file, pattern = ".gz$|.xz$", replacement = "")) if (file_ext == "parquet") rlang::check_installed("arrow") @@ -50,10 +72,11 @@ guess_write_function <- function(file){ write_fn <- switch( file_ext, "rds" = saveRDS, - "csv" = write.csv, + "csv" = utils::write.csv, + "tsv" = function(x, file, ..., sep = "\t") utils::write.csv(x = x, file = file, sep = sep, ...), "txt" = writeLines, "parquet" = arrow::write_parquet, - "json" = jsonlite::toJSON, + "json" = jsonlite::write_json, cli::cli_abort("File type {.val {file_ext}} is not recognized, please provide a {.arg write_function}") ) diff --git a/man/guess_read_function.Rd b/man/guess_read_function.Rd new file mode 100644 index 0000000..34480de --- /dev/null +++ b/man/guess_read_function.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_read.R +\name{guess_read_function} +\alias{guess_read_function} +\title{Guess read function from file extension} +\usage{ +guess_read_function(file) +} +\arguments{ +\item{file}{filename to parse} +} +\value{ +function for reading the file, if found +} +\description{ +This function accepts a filename and tries to return a valid function for +reading it.
+} +\details{ +\code{guess_read_function} understands the following file extensions: +\itemize{ +\item rds with \code{readRDS} +\item csv, csv.gz, csv.xz with \code{utils::read.csv} +\item tsv, tsv.gz, tsv.xz with \code{utils::read.delim} +\item parquet with \code{arrow::read_parquet} +\item txt, txt.gz, txt.xz with \code{readLines} +\item json, json.gz, json.xz with \code{jsonlite::fromJSON} +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_write_function}()}, +\code{\link{pb_read}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} +\keyword{internal} diff --git a/man/guess_write_function.Rd b/man/guess_write_function.Rd new file mode 100644 index 0000000..72ee984 --- /dev/null +++ b/man/guess_write_function.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_write.R +\name{guess_write_function} +\alias{guess_write_function} +\title{Guess write function from file extension} +\usage{ +guess_write_function(file) +} +\arguments{ +\item{file}{filename to parse} +} +\value{ +function for writing the file, if found +} +\description{ +This function accepts a filename and tries to return a valid function for +writing to it. +} +\details{ +\code{guess_write_function} understands the following file extensions: +\itemize{ +\item rds with \code{saveRDS} +\item csv, csv.gz, csv.xz with \code{utils::write.csv} +\item tsv, tsv.gz, tsv.xz with a modified \code{utils::write.csv} where sep is set to \code{"\\t"} +\item parquet with \code{arrow::write_parquet} +\item txt, txt.gz, txt.xz with \code{writeLines} +\item json, json.gz, json.xz with \code{jsonlite::write_json} +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{pb_read}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} +\keyword{internal} diff --git a/man/pb_download_url.Rd b/man/pb_download_url.Rd index 0172211..757d10d 100644 --- a/man/pb_download_url.Rd +++ b/man/pb_download_url.Rd @@ -39,11 +39,34 @@ functions that are able to accept URLs.
\donttest{ \dontshow{try(\{} -# returns browser url by default -pb_download_url("iris.tsv.xz", repo = "cboettig/piggyback-tests", tag = "v0.0.1") +# returns browser url by default (and all files if none are specified) +browser_url <- pb_download_url( + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) +print(browser_url) +utils::read.csv(browser_url[[1]]) # can return api url if desired -pb_download_url("iris.tsv.xz", repo = "cboettig/piggyback-tests", tag = "v0.0.1", url_type = "api") +api_url <- pb_download_url( + "mtcars.csv", + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) +print(api_url) + +# for public repositories, this will still work +utils::read.csv(api_url) + +# for private repos, can use httr or curl to fetch and then pass into read function +gh_pat <- Sys.getenv("GITHUB_PAT") + +if(!identical(gh_pat, "")){ + resp <- httr::GET(api_url, httr::add_headers(Authorization = paste("Bearer", gh_pat))) + utils::read.csv(text = httr::content(resp, as = "text")) +} + +# or use pb_read which bundles some of this for you \dontshow{\})} } diff --git a/man/pb_read.Rd b/man/pb_read.Rd index aad97db..7b8bd2c 100644 --- a/man/pb_read.Rd +++ b/man/pb_read.Rd @@ -21,10 +21,13 @@ pb_read( \item{tag}{string: tag for the GH release, defaults to "latest"} -\item{read_function}{function: specifies how to read in the data. Default -tries to guess a function based on file extension (csv, rds, parquet, txt, json)} +\item{read_function}{function: used to read in the data, where the file is +passed as the first argument and any additional arguments are subsequently +passed in via \code{...}. 
Default \code{guess_read_function(file)} will check the file +extension and try to find an appropriate read function if the extension is one +of rds, csv, tsv, parquet, txt, or json, and will abort if not found.} -\item{...}{additional arguments passed to \code{read_function}} +\item{...}{additional arguments passed to \code{read_function} after file} \item{.token}{GitHub authentication token, see \code{\link[gh:gh_token]{gh::gh_token()}}} } @@ -35,7 +38,7 @@ Result of reading in the file in question. A convenience wrapper around writing an object to a temporary file and then uploading to a specified repo/release. This convenience comes at a cost to performance efficiency, since it first downloads the data to disk and then -reads the data from disk into memory. See \code{vignette("duckdb_arrow")} for +reads the data from disk into memory. See \code{vignette("cloud_native")} for alternative ways to bypass this flow and work with the data directly. } \examples{ @@ -45,3 +48,10 @@ try({ # try block is to avoid CRAN issues and is not required in ordinary usage }) } } +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{guess_write_function}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} diff --git a/man/pb_write.Rd b/man/pb_write.Rd index bec9ec6..07b198d 100644 --- a/man/pb_write.Rd +++ b/man/pb_write.Rd @@ -24,8 +24,12 @@ pb_write( \item{tag}{string: tag for the GH release, defaults to "latest"} -\item{write_function}{function: specifies how to read in the data. Default -tries to guess a function based on file extension (csv, rds, txt, parquet, json)} +\item{write_function}{function: used to write an R object to file, where the +object is passed as the first argument, the filename as the second argument, +and any additional arguments are subsequently passed in via \code{...}. 
Default +\code{guess_write_function(file)} will check the file extension and try to find an +appropriate write function if the extension is one of rds, csv, tsv, parquet, +txt, or json, and will abort if not found.} \item{...}{additional arguments passed to \code{write_function}} @@ -40,11 +44,18 @@ uploading to a specified repo/release. } \examples{ \donttest{ -if (interactive()) { +\dontshow{if (interactive()) \{} pb_write(mtcars, "mtcars.rds", repo = "tanho63/piggyback-tests") #> ℹ Uploading to latest release: "v0.0.2". #> ℹ Uploading mtcars.rds ... #> |===============================================================| 100\% +\dontshow{\}} } } +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{guess_write_function}()}, +\code{\link{pb_read}()} } +\concept{pb_rw}