diff --git a/.gitignore b/.gitignore index 4a03414..bf695ed 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ data/* .DS_Store CRAN_SUBMISSION CRAN_RELEASE +docs/* diff --git a/DESCRIPTION b/DESCRIPTION index edbb42f..8642924 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,10 @@ Package: piggyback Version: 0.1.5.9004 Title: Managing Larger Data on a GitHub Repository -Description: Because larger (> 50 MB) data files cannot easily be committed to git, - a different approach is required to manage data associated with an analysis in a - GitHub repository. This package provides a simple work-around by allowing larger - (up to 2 GB) data files to piggyback on a repository as assets attached to individual - GitHub releases. These files are not handled by git in any way, but instead are - uploaded, downloaded, or edited directly by calls through the GitHub API. These - data files can be versioned manually by creating different releases. This approach - works equally well with public or private repositories. Data can be uploaded - and downloaded programmatically from scripts. No authentication is required to - download data from public repositories. +Description: Helps store files as GitHub release assets, which is a convenient + way for large/binary data files to piggyback onto public and private GitHub + repositories. Includes functions for file downloads, uploads, and managing + releases via the GitHub API. 
Authors@R: c(person("Carl", "Boettiger", email = "cboettig@gmail.com", role = c("aut", "cre", "cph"), @@ -43,9 +37,8 @@ Imports: memoise, rlang Suggests: + arrow, spelling, - duckdbfs, - duckdb, readr, covr, testthat, diff --git a/NAMESPACE b/NAMESPACE index ddd6b6d..3d2133d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,9 @@ export(pb_download) export(pb_download_url) export(pb_list) export(pb_new_release) +export(pb_read) export(pb_release_create) export(pb_release_delete) export(pb_releases) export(pb_upload) +export(pb_write) diff --git a/NEWS.md b/NEWS.md index 4761470..1958515 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ before trying API download URLs. This should reduce/eliminate effect of API rate limits for pb_download. [#109] * `"latest"` release now aligns with GitHub's "latest" release definition [#113] * `pb_download_url()` now can return choice of "browser" or "api" download URLs [#116] +* Add new functions `pb_read()` and `pb_write()` as convenience wrappers around +pattern of downloading to `tempfile()` and then reading into memory. [#97] # piggyback 0.1.5 diff --git a/R/pb_download_url.R b/R/pb_download_url.R index 79dce69..43a646b 100644 --- a/R/pb_download_url.R +++ b/R/pb_download_url.R @@ -70,7 +70,7 @@ pb_download_url <- function(file = NULL, file <- file[file %in% df$file_name] } - if(length(file) == 0) return(cli::cli_abort("No download URLs to return.")) + if(length(file) == 0) return(cli::cli_abort("No download URLs found")) switch( url_type, diff --git a/R/pb_read.R b/R/pb_read.R new file mode 100644 index 0000000..79e6baf --- /dev/null +++ b/R/pb_read.R @@ -0,0 +1,90 @@ +#' Read one file into memory +#' +#' A convenience wrapper around downloading a file to a temporary location and +#' then reading it into memory. This convenience comes at a cost to +#' performance efficiency, since it first downloads the data to disk and then +#' reads the data from disk into memory. 
See `vignette("cloud_native")` for +#' alternative ways to bypass this flow and work with the data directly. +#' +#' @param file string: file name +#' @param repo string: GH repository name in format "owner/repo". Default +#' `guess_repo()` tries to guess based on current working directory's git repo +#' @param tag string: tag for the GH release, defaults to "latest" +#' @param read_function function: used to read in the data, where the file is +#' passed as the first argument and any additional arguments are subsequently +#' passed in via `...`. Default `guess_read_function(file)` will check the file +#' extension and try to find an appropriate read function if the extension is one +#' of rds, csv, tsv, parquet, txt, or json, and will abort if not found. +#' @param ... additional arguments passed to `read_function` after file +#' @param .token GitHub authentication token, see [gh::gh_token()] +#' +#' @export +#' @family pb_rw +#' +#' @return Result of reading in the file in question. +#' @examples \donttest{ +#' try({ # try block is to avoid CRAN issues and is not required in ordinary usage +#' piggyback::pb_read("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") +#' }) +#' } +pb_read <- function(file, + ..., + repo = guess_repo(), + tag = "latest", + read_function = guess_read_function(file), + .token = gh::gh_token()) { + stopifnot( + is.character(file) && length(file) == 1, + is.character(repo) && length(repo) == 1, + is.character(tag) && length(tag) == 1, + rlang::is_function(read_function) + ) + + on.exit(unlink(file.path(tempdir(), file))) + + pb_download( + file = file, + dest = tempdir(check = TRUE), + repo = repo, + tag = tag, + overwrite = TRUE, + .token = .token + ) + + read_function(file.path(tempdir(), file), ...) +} + +#' Guess read function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' reading it. 
+#' +#' `guess_read_function` understands the following file extensions: +#' - rds with `readRDS` +#' - csv, csv.gz, csv.xz with `utils::read.csv` +#' - tsv, tsv.gz, tsv.xz with `utils::read.delim` +#' - parquet with `arrow::read_parquet` +#' - txt, txt.gz, txt.xz with `readLines` +#' - json, json.gz, json.xz with `jsonlite::fromJSON` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for reading the file, if found +#' @keywords internal +guess_read_function <- function(file){ + file_ext <- tools::file_ext(gsub(x = file, pattern = "\\.gz$|\\.xz$", replacement = "")) + if (file_ext == "parquet") rlang::check_installed("arrow") + + read_fn <- switch( + file_ext, + "rds" = readRDS, + "csv" = utils::read.csv, + "tsv" = utils::read.delim, + "parquet" = arrow::read_parquet, + "txt" = readLines, + "json" = jsonlite::fromJSON, + cli::cli_abort("File type {.val {file_ext}} is not recognized, please provide a {.arg read_function}") + ) + + return(read_fn) +} diff --git a/R/pb_write.R b/R/pb_write.R new file mode 100644 index 0000000..10f4b1a --- /dev/null +++ b/R/pb_write.R @@ -0,0 +1,84 @@ +#' Write one object to repo/release +#' +#' A convenience wrapper around writing an object to a temporary file and then +#' uploading to a specified repo/release. +#' +#' @param x object: memory object to save to piggyback +#' @param file string: file name +#' @param ... additional arguments passed to `write_function` +#' @param repo string: GH repository name in format "owner/repo". Default +#' `guess_repo()` tries to guess based on current working directory's git repo +#' @param tag string: tag for the GH release, defaults to "latest" +#' @param write_function function: used to write an R object to file, where the +#' object is passed as the first argument, the filename as the second argument, +#' and any additional arguments are subsequently passed in via `...`. 
Default +#' `guess_write_function(file)` will check the file extension and try to find an +#' appropriate write function if the extension is one of rds, csv, tsv, parquet, +#' txt, or json, and will abort if not found. +#' @param .token GitHub authentication token, see [gh::gh_token()] +#' +#' @export +#' @family pb_rw +#' +#' @return Writes file to release and returns github API response +#' @examples \donttest{ +#' \dontshow{if (interactive()) \{} +#' pb_write(mtcars, "mtcars.rds", repo = "tanho63/piggyback-tests") +#' #> ℹ Uploading to latest release: "v0.0.2". +#' #> ℹ Uploading mtcars.rds ... +#' #> |===============================================================| 100% +#' \dontshow{\}} +#'} +pb_write <- function(x, + file, + ..., + repo = guess_repo(), + tag = "latest", + write_function = guess_write_function(file), + .token = gh::gh_token()) { + stopifnot( + is.character(file) && length(file) == 1, + is.character(repo) && length(repo) == 1, + is.character(tag) && length(tag) == 1, + rlang::is_function(write_function) + ) + destfile <- file.path(tempdir(check = TRUE), file) + on.exit(try(unlink(destfile))) + write_function(x, destfile, ...) + pb_upload(destfile, repo = repo, tag = tag, .token = .token) +} + +#' Guess write function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' writing to it. 
+#' +#' `guess_write_function` understands the following file extensions: +#' - rds with `saveRDS` +#' - csv, csv.gz, csv.xz with `utils::write.csv` +#' - tsv, tsv.gz, tsv.xz with `utils::write.table` where sep is set to `"\t"` +#' - parquet with `arrow::write_parquet` +#' - txt, txt.gz, txt.xz with `writeLines` +#' - json, json.gz, json.xz with `jsonlite::write_json` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for writing the file, if found +#' @keywords internal +guess_write_function <- function(file){ + file_ext <- tools::file_ext(gsub(x = file, pattern = "\\.gz$|\\.xz$", replacement = "")) + if (file_ext == "parquet") rlang::check_installed("arrow") + + write_fn <- switch( + file_ext, + "rds" = saveRDS, + "csv" = utils::write.csv, + "tsv" = function(x, file, ..., sep = "\t") utils::write.table(x = x, file = file, sep = sep, ...), + "txt" = writeLines, + "parquet" = arrow::write_parquet, + "json" = jsonlite::write_json, + cli::cli_abort("File type {.val {file_ext}} is not recognized, please provide a {.arg write_function}") + ) + + return(write_fn) +} diff --git a/README.Rmd b/README.Rmd index d9dce35..1660ae9 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,7 +7,7 @@ output: github_document ```{r setup, include = FALSE} knitr::opts_chunk$set( - eval = TRUE, + eval = FALSE, collapse = TRUE, message = FALSE, comment = "#>", @@ -29,7 +29,7 @@ knitr::opts_chunk$set( [![DOI](http://joss.theoj.org/papers/10.21105/joss.00971/status.svg)](https://doi.org/10.21105/joss.00971) -`{piggyback}` provides an R interface for storing files as GitHub release assets, +`piggyback` provides an R interface for storing files as GitHub release assets, which is a convenient way for large/binary data files to _piggyback_ onto public and private GitHub repositories. This package includes functions for file downloads, uploads, and managing releases, which then are passed to the GitHub API. 
@@ -38,12 +38,12 @@ No authentication is required to download data from public repositories. ## Installation Install from CRAN via: -```r +```{r} install.packages("piggyback") ``` You can install the development version from [GitHub](https://github.com/ropensci/piggyback) with either r-universe or with remotes: -```r +```{r} install.packages("piggyback", repos = c('https://ropensci.r-universe.dev', getOption("repos"))) # install.packages("remotes") remotes::install_github("ropensci/piggyback") @@ -53,7 +53,7 @@ See [getting started vignette](https://docs.ropensci.org/piggyback/articles/intr for a more comprehensive introduction. Download data attached to a GitHub release: -```r +```{r} library(piggyback) pb_download("iris2.tsv.gz", repo = "cboettig/piggyback-tests", @@ -72,12 +72,12 @@ or a GITHUB_PAT environment variable - for more information, see the vignette no [authentication](https://docs.ropensci.org/piggyback/articles/piggyback.html#authentication). We can also upload data to a release. Start by creating a release: -```r +```{r} pb_release_create(repo = "cboettig/piggyback-tests", tag = "v0.0.2") #> ✔ Created new release "v0.0.2". ``` then upload to it: -```r +```{r} readr::write_tsv(mtcars, "mtcars.tsv.gz") pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") #> ℹ Uploading to latest release: "v0.0.2". @@ -117,8 +117,7 @@ Please note that this project is released with a [Contributor Code of Conduct](https://ropensci.org/code-of-conduct/). By participating in this project you agree to abide by its terms. 
-```{r include=FALSE} -unlink("*.gz") +```{r eval=TRUE, include=FALSE} codemeta::write_codemeta() ``` diff --git a/README.md b/README.md index b7ccec5..38ced78 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ Status](https://badges.ropensci.org/220_status.svg)](https://github.com/ropensci [![DOI](http://joss.theoj.org/papers/10.21105/joss.00971/status.svg)](https://doi.org/10.21105/joss.00971) -`{piggyback}` provides an R interface for storing files as GitHub -release assets, which is a convenient way for large/binary data files to +`piggyback` provides an R interface for storing files as GitHub release +assets, which is a convenient way for large/binary data files to *piggyback* onto public and private GitHub repositories. This package includes functions for file downloads, uploads, and managing releases, which then are passed to the GitHub API. diff --git a/codemeta.json b/codemeta.json index af1aeb7..80b2449 100644 --- a/codemeta.json +++ b/codemeta.json @@ -2,12 +2,12 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "piggyback", - "description": "Because larger (> 50 MB) data files cannot easily be committed to git, a different approach is required to manage data associated with an analysis in a GitHub repository. This package provides a simple work-around by allowing larger (up to 2 GB) data files to piggyback on a repository as assets attached to individual GitHub releases. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. 
No authentication is required to download data from public repositories.", + "description": "Helps store files as GitHub release assets, which is a convenient way for large/binary data files to piggyback onto public and private GitHub repositories. Includes functions for file downloads, uploads, and managing releases via the GitHub API.", "name": "piggyback: Managing Larger Data on a GitHub Repository", "codeRepository": "https://github.com/ropensci/piggyback", "issueTracker": "https://github.com/ropensci/piggyback/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.5.9003", + "version": "0.1.5.9004", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", @@ -74,6 +74,18 @@ } ], "softwareSuggestions": [ + { + "@type": "SoftwareApplication", + "identifier": "arrow", + "name": "arrow", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=arrow" + }, { "@type": "SoftwareApplication", "identifier": "spelling", @@ -282,5 +294,5 @@ }, "SystemRequirements": null }, - "fileSize": "380.757KB" + "fileSize": "397.705KB" } diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 076b8d3..0000000 --- a/docs/404.html +++ /dev/null @@ -1,125 +0,0 @@ - - - - - - - -Page not found (404) • piggyback - - - - - - - - - - - - - - - - - - -
-
- - - - -
-
- - -Content not found. Please use links in the navbar. - -
- - - -
- - - - -
- - - - - - - - diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html deleted file mode 100644 index f1883e1..0000000 --- a/docs/CODE_OF_CONDUCT.html +++ /dev/null @@ -1,98 +0,0 @@ - -Contributor Code of Conduct • piggyback - - -
-
- - - -
-
- - -
- -

As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.

-

We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.

-

Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.

-

Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.

-

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.

-

This Code of Conduct is adapted from the Contributor Covenant (http://contributor-covenant.org), version 1.0.0, available at http://contributor-covenant.org/version/1/0/0/

-
- -
- - - -
- - - -
- - - - - - - - diff --git a/docs/LICENSE.html b/docs/LICENSE.html deleted file mode 100644 index 2c3e160..0000000 --- a/docs/LICENSE.html +++ /dev/null @@ -1,285 +0,0 @@ - -GNU General Public License • piggyback - - -
-
- - - -
-
- - -
- -

Version 3, 29 June 2007
Copyright © 2007 Free Software Foundation, Inc. <http://fsf.org/>

-

Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

-
-

Preamble

-

The GNU General Public License is a free, copyleft license for software and other kinds of works.

-

The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program–to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too.

-

When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.

-

To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others.

-

For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.

-

Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it.

-

For the developers’ and authors’ protection, the GPL clearly explains that there is no warranty for this free software. For both users’ and authors’ sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions.

-

Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users’ freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users.

-

Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free.

-

The precise terms and conditions for copying, distribution and modification follow.

-
-
-

TERMS AND CONDITIONS

-
-

0. Definitions

-

“This License” refers to version 3 of the GNU General Public License.

-

“Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.

-

“The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations.

-

To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work.

-

A “covered work” means either the unmodified Program or a work based on the Program.

-

To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well.

-

To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying.

-

An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion.

-
-
-

1. Source Code

-

The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work.

-

A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language.

-

The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it.

-

The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work’s System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work.

-

The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source.

-

The Corresponding Source for a work in source code form is that same work.

-
-
-

2. Basic Permissions

-

All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law.

-

You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you.

-

Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary.

-
-
- -

No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures.

-

When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work’s users, your or third parties’ legal rights to forbid circumvention of technological measures.

-
-
-

4. Conveying Verbatim Copies

-

You may convey verbatim copies of the Program’s source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program.

-

You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee.

-
-
-

5. Conveying Modified Source Versions

-

You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions:

-
  • -a) The work must carry prominent notices stating that you modified it, and giving a relevant date.
  • -
  • -b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”.
  • -
  • -c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it.
  • -
  • -d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so.
  • -

A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation’s users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate.

-
-
-

6. Conveying Non-Source Forms

-

You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways:

-
  • -a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange.
  • -
  • -b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge.
  • -
  • -c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b.
  • -
  • -d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements.
  • -
  • -e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d.
  • -

A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work.

-

A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product.

-

“Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made.

-

If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM).

-

The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network.

-

Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying.

-
-
-

7. Additional Terms

-

“Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions.

-

When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission.

-

Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms:

-
  • -a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or
  • -
  • -b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or
  • -
  • -c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or
  • -
  • -d) Limiting the use for publicity purposes of names of licensors or authors of the material; or
  • -
  • -e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or
  • -
  • -f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors.
  • -

All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying.

-

If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms.

-

Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way.

-
-
-

8. Termination

-

You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11).

-

However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation.

-

Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice.

-

Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10.

-
-
-

9. Acceptance Not Required for Having Copies

-

You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so.

-
-
-

10. Automatic Licensing of Downstream Recipients

-

Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License.

-

An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party’s predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts.

-

You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it.

-
-
-

11. Patents

-

A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor’s “contributor version”.

-

A contributor’s “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License.

-

Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor’s essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version.

-

In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party.

-

If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. “Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient’s use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid.

-

If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it.

-

A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007.

-

Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law.

-
-
-

12. No Surrender of Others’ Freedom

-

If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program.

-
-
-

13. Use with the GNU Affero General Public License

-

Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such.

-
-
-

14. Revised Versions of this License

-

The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.

-

Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation.

-

If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy’s public statement of acceptance of a version permanently authorizes you to choose that version for the Program.

-

Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version.

-
-
-

15. Disclaimer of Warranty

-

THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

-
-
-

16. Limitation of Liability

-

IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

-
-
-

17. Interpretation of Sections 15 and 16

-

If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee.

-

END OF TERMS AND CONDITIONS

-
-
-
-

How to Apply These Terms to Your New Programs

-

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

-

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found.

-
<one line to give the program's name and a brief idea of what it does.>
-Copyright (C) 2018 Carl Boettiger
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-

Also add information on how to contact you by electronic and paper mail.

-

If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:

-
piggyback Copyright (C) 2018 Carl Boettiger
-This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'.
-This is free software, and you are welcome to redistribute it
-under certain conditions; type 'show c' for details.
-

The hypothetical commands show w and show c should show the appropriate parts of the General Public License. Of course, your program’s commands might be different; for a GUI interface, you would use an “about box”.

-

You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>.

-

The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>.

-
-
- -
- - - -
- - - -
- - - - - - - - diff --git a/docs/apple-touch-icon-120x120.png b/docs/apple-touch-icon-120x120.png deleted file mode 100644 index 2a5c876..0000000 Binary files a/docs/apple-touch-icon-120x120.png and /dev/null differ diff --git a/docs/apple-touch-icon-152x152.png b/docs/apple-touch-icon-152x152.png deleted file mode 100644 index fde0115..0000000 Binary files a/docs/apple-touch-icon-152x152.png and /dev/null differ diff --git a/docs/apple-touch-icon-180x180.png b/docs/apple-touch-icon-180x180.png deleted file mode 100644 index a7dafd3..0000000 Binary files a/docs/apple-touch-icon-180x180.png and /dev/null differ diff --git a/docs/apple-touch-icon-60x60.png b/docs/apple-touch-icon-60x60.png deleted file mode 100644 index f34a29e..0000000 Binary files a/docs/apple-touch-icon-60x60.png and /dev/null differ diff --git a/docs/apple-touch-icon-76x76.png b/docs/apple-touch-icon-76x76.png deleted file mode 100644 index f980333..0000000 Binary files a/docs/apple-touch-icon-76x76.png and /dev/null differ diff --git a/docs/apple-touch-icon.png b/docs/apple-touch-icon.png deleted file mode 100644 index faae37f..0000000 Binary files a/docs/apple-touch-icon.png and /dev/null differ diff --git a/docs/articles/alternatives.html b/docs/articles/alternatives.html deleted file mode 100644 index 42ae5bc..0000000 --- a/docs/articles/alternatives.html +++ /dev/null @@ -1,189 +0,0 @@ - - - - - - - -Piggyback comparison to alternatives • piggyback - - - - - - - - - - - - - - - - - - - -
-
- - - - -
-
- - - - -
-

-piggyback vs the alternatives -

-

There are many alternatives to piggyback, and after considerable experience I haven’t found any that ticked all the boxes for me:

-
    -
  • - -Free storage
  • -
  • - -Can be integrated into private code / private workflows
  • -
  • - -Simple and practical to deploy on continuous integration
  • -
  • - -Works well with private data
  • -
  • - -Minimal configuration
  • -
-
-

Git LFS -

-

Git LFS provides the closest user experience to what I was going for. It stands out above all other alternatives for providing both the best authentication experience (relying directly on any of the standard git authentication mechanisms such as https, ssh keys, app integration), and it provides the most legitimate version control of the data. However, there are many show-stoppers to using Git LFS for me.

-
    -
  • GitHub pricing & resulting problems for GitHub’s fork / PR model. Described eloquently here. Basically, despite generous rates and free data options everywhere else, GitHub’s LFS storage and bandwidth not only cost a lot, but also make it impossible to have public forks and pull request for your repository. Technically this is a problem only for GitHub’s LFS (since it stems from the pricing rules); and can be avoided by using LFS on GitLab or other platform, as Jim Hester has described. Still, this proved unsuccessful for me, and still faces the other big issue with git-lfs:

  • -
  • Overwrites git itself. Git LFS is just too integrated into git – it replaces your authentic git engine with git-lfs, such that the identical git command can have different behaviors on a machine with git-lfs installed vs just plain git. Maybe fine for a professional team that is “all in” on git-lfs, but is a constant source of pitfalls when working with students and moving between machines that all have only authentic git installed. The difficulties with supporting pull requests etc are also related to this – in some sense, once you have a git-lfs repository, you’re really using an entirely new version control system that isn’t going to be 100% compatible with the nearly-ubiquitous authentic git.

  • -
-
-
-

Amazon S3 -

-

Amazon S3 is perhaps the most universal and most obvious go-to place for online-available public and private data storage. The 5 GB/mo free tier is nice and the pricing is very reasonable and only very incremental after that. It is easily the most industry-standard solution, and still probably the best way to go in many cases. It is probably the most scalable solution for very large data, and the only such that has built in support/integration to larger query services like Apache Spark / sparklyr. It falls short of my own use case though in the authentication area. I require students create a GitHub account for my courses and my lab group. I don’t like requiring such third-party accounts, but this one is fundamental to our daily use in classroom and in research, and most of them will continue using the service afterwards. I particularly don’t like having people create complex accounts that they might not even use much in the class or afterwards, just to deal with some pesky minor issue of some data file that is just a little too big for GitHub.

-

Amazon’s authentication is also much more complex than GitHub’s passwords or tokens, as is the process of uploading and downloading data from S3 (though the aws.s3 R package is a rather nice remedy here, it doesn’t conform to the same user API as the aws-cli (python) tool, leaving some odd quirks and patterns that don’t match standard Linux commands.) Together, these make it significantly more difficult to deploy as a quick solution for moving private data around with private repositories.

-
-
-

Scientific repositories with private storage -

-

For scientific research purposes, this would be my ideal solution. Encouraging researchers to submit data to a repository at the time of publication is always a challenge, since doing so inevitably involves time & effort and the immediate benefit to the researcher is relatively minimal. If uploading the data to a repository served an immediate practical purpose of facilitating collaboration, backing up and possibly versioning data, etc, during the research process itself rather than after all is said and done, it would be much more compelling. Several repositories permit sharing of private data, at least up to some threshold, including DataONE and figshare. Unfortunately, at this time, I have found the interfaces and R tooling for these too limited or cumbersome for everyday use.

-
-
-

-datastorr -

-

The piggyback approach is partly inspired by the strategy used in the datastorr package, which also uploads data to GitHub releases. datastorr envisions a rather different workflow around this storage strategy, based on the concept of an R “data package” rather than the Git LFS. I am not a fan of the “data package” approach in general – I think data should be stored in a platform agnostic way, not as .Rdata files, and I often want to first download my data to disk and read it with dedicated functions, not load it “auto-magically” as a package. This latter issue is particularly important when the data files are larger than what can conveniently fit into working memory, and is better accessed as a database (e.g. SQLite for tabular data, postgis spatial data, etc).

-

In terms of practical implementation, datastorr also creates a new release every time the data file is updated, rather than letting you overwrite files. In principle piggyback will let you version data this way as well, simply create a new release first using pb_new_release(tag="v2") or whatever tag you like. I have not opted for this workflow since in reality, versioning data with releases this way is technically equivalent to creating a new folder for each new version of the data and storing that – unlike true git commits, release assets such as datastorr creates can be easily deleted or overwritten. I still believe permanent versioned archives like Zenodo should be used for long-term versioned distribution. Meanwhile, for day-to-day use I often want to overwrite data files with their most recent versions. (In my case these ‘data’ files are most often created from upstream data and/or other possibly-long-running code, and are tracked for convenience. As such they often change as a result of continued work on the upstream processing code. Perhaps this is not the case for many users and more attention should be paid to versioning.)

-
-
-

Sharding on GitHub -

-

Another creative solution (hack), at least for some file types, is to break large files into multiple smaller files, and commit those to one or many GitHub repositories. While sharding is sometimes a legitimate strategy, it has many obvious practical disadvantages and limitations.

-
-
-
- - - -
- - - - -
- - - - - - - - diff --git a/docs/articles/alternatives_files/header-attrs-2.10/header-attrs.js b/docs/articles/alternatives_files/header-attrs-2.10/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/articles/alternatives_files/header-attrs-2.10/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/articles/index.html b/docs/articles/index.html deleted file mode 100644 index d547ef7..0000000 --- a/docs/articles/index.html +++ /dev/null @@ -1,91 +0,0 @@ - -Articles • piggyback - - -
-
- - - -
-
- - - -
-
- - -
- - - - - - - - diff --git a/docs/articles/intro.html b/docs/articles/intro.html deleted file mode 100644 index 7202eae..0000000 --- a/docs/articles/intro.html +++ /dev/null @@ -1,256 +0,0 @@ - - - - - - - -Piggyback Data atop your GitHub Repository! • piggyback - - - - - - - - - - - - - - - - - - - -
-
- - - - -
-
- - - - -
-

Why piggyback? -

-

piggyback grew out of the needs of students both in my classroom and in my research group, who frequently need to work with data files somewhat larger than one can conveniently manage by committing directly to GitHub. As we frequently want to share and run code that depends on >50MB data files on each of our own machines, on continuous integration, and on larger computational servers, data sharing quickly becomes a bottleneck.

-

GitHub allows repositories to attach files of up to 2 GB each to releases as a way to distribute large files associated with the project source code. There is no limit on the number of files or bandwidth to deliver them.

-
-

Installation -

-

Install the latest release from CRAN using:

-
-install.packages("piggyback")
-

You can install the development version from GitHub with:

-
-# install.packages("devtools")
-devtools::install_github("ropensci/piggyback")
-
-
-

Authentication -

-

No authentication is required to download data from public GitHub repositories using piggyback. Nevertheless, piggyback recommends setting a token when possible to avoid rate limits. To upload data to any repository, or to download data from private repositories, you will need to authenticate first.

-

To do so, add your GitHub Token to an environmental variable, e.g. in a .Renviron file in your home directory or project directory (any private place you won’t upload), see usethis::edit_r_environ(). For one-off use you can also set your token from the R console using:

-
-Sys.setenv(GITHUB_PAT="xxxxxx")
-

But try to avoid putting Sys.setenv() in any R scripts – remember, the goal here is to avoid writing your private token in any file that might be shared, even privately.

-

For more information, please see the usethis guide to GitHub credentials

-
-
-

Downloading data -

-

Download the latest version or a specific version of the data:

- -
-pb_download("iris2.tsv.gz",
-            repo = "cboettig/piggyback-tests",
-            tag = "v0.0.1",
-            dest = tempdir())
-

Note: Whenever you are working from a location inside a git repository corresponding to your GitHub repo, you can simply omit the repo argument and it will be detected automatically. Likewise, if you omit the release tag, then `pb_download` will simply pull data from the most recent release (`latest`). Third, you can omit `tempdir()` if you are using an RStudio Project (`.Rproj` file) in your repository, and then the download location will be relative to the Project root. `tempdir()` is used throughout the examples only to meet CRAN policies and is unlikely to be the choice you actually want here.

-

Lastly, simply omit the file name to download all assets connected with a given release.

-
-pb_download(repo = "cboettig/piggyback-tests",
-            tag = "v0.0.1",
-            dest = tempdir())
-

These defaults mean that in most cases, it is sufficient to simply call pb_download() without additional arguments to pull in any data associated with a project on a GitHub repo that is too large to commit to git directly.

-

pb_download() will skip the download of any file that already exists locally if the timestamp on the local copy is more recent than the timestamp on the GitHub copy. pb_download() also includes arguments to control the timestamp behavior, progress bar, whether existing files should be overwritten, or if any particular files should not be downloaded. See function documentation for details.

-

Sometimes it is preferable to have a URL from which the data can be read in directly, rather than downloading the data to a local file. For example, such a URL can be embedded directly into another R script, avoiding any dependence on piggyback (provided the repository is already public.) To get a list of URLs rather than actually downloading the files, use pb_download_url():

-
-pb_download_url("data/mtcars.tsv.gz",
-                repo = "cboettig/piggyback-tests",
-                tag = "v0.0.1") 
-
-
-

Uploading data -

-

If your GitHub repository doesn’t have any releases yet, piggyback will help you quickly create one. Create new releases to manage multiple versions of a given data file. While you can create releases as often as you like, making a new release is by no means necessary each time you upload a file. If maintaining old versions of the data is not useful, you can stick with a single release and upload all of your data there.

-
-pb_new_release("cboettig/piggyback-tests", "v0.0.2")
-

Once we have at least one release available, we are ready to upload. By default, pb_upload will attach data to the latest release.

-
-## We'll need some example data first.
-## Pro tip: compress your tabular data to save space & speed upload/downloads
-readr::write_tsv(mtcars, "mtcars.tsv.gz")
-
-pb_upload("mtcars.tsv.gz",
-          repo = "cboettig/piggyback-tests",
-          tag = "v0.0.1")
-

Like pb_download(), pb_upload() will overwrite any file of the same name already attached to the release file by default, unless the timestamp of the previously uploaded version is more recent. You can toggle these settings with overwrite=FALSE and use_timestamps=FALSE.

-
-
-

Additional convenience functions -

-

List all files currently piggybacking on a given release. Omit the tag to see files on all releases.

-
-pb_list(repo = "cboettig/piggyback-tests",
-        tag = "v0.0.1")
-

Delete a file from a release:

-
-pb_delete(file = "mtcars.tsv.gz",
-          repo = "cboettig/piggyback-tests",
-          tag = "v0.0.1")
-

Note that this is irreversible unless you have a copy of the data elsewhere.

-
-
-

Multiple files -

-

You can pass in a vector of file paths with something like list.files() to the file argument of pb_upload() in order to upload multiple files. Some common patterns:

-
-library(magrittr)
-
-## upload a folder of data
-list.files("data") %>%
-  pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1")
-
-## upload certain file extensions
-list.files(pattern = c("*.tsv.gz", "*.tif", "*.zip")) %>%
-  pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1")
-

Similarly, you can download all current data assets of the latest or specified release by using pb_download() with no arguments.

-
-
-

Caching -

-

To reduce API calls to GitHub, piggyback caches most calls with a timeout of 1 second by default. This avoids repeating identical requests to update its internal record of the repository data (releases, assets, timestamps, etc.) during programmatic use. You can increase or decrease this delay by setting the environmental variable in seconds, e.g. Sys.setenv("piggyback_cache_duration"=10) for a longer delay or Sys.setenv("piggyback_cache_duration"=0) to disable caching.

-
-
-

Valid file names -

-

GitHub assets attached to a release do not support file paths, and will convert most special characters (#, %, etc) to . or throw an error (e.g. for file names containing $, @, /). piggyback will default to using the base name of the file only (i.e. will only use "mtcars.csv" if provided a file path like "data/mtcars.csv")

-
-
-

A Note on GitHub Releases vs Data Archiving -

-

piggyback is not intended as a data archiving solution. Importantly, bear in mind that there is nothing special about multiple “versions” in releases, as far as data assets uploaded by piggyback are concerned. The data files piggyback attaches to a Release can be deleted or modified at any time – creating a new release to store data assets is the functional equivalent of just creating new directories v0.1, v0.2 to store your data. (GitHub Releases are always pinned to a particular git tag, so the code/git-managed contents associated with repo are more immutable, but remember our data assets just piggyback on top of the repo).

-

Permanent, published data should always be archived in a proper data repository with a DOI, such as zenodo.org. Zenodo can freely archive public research data files up to 50 GB in size, and data is strictly versioned (once released, a DOI always refers to the same version of the data, new releases are given new DOIs). piggyback is meant only to lower the friction of working with data during the research process. (e.g. provide data accessible to collaborators or continuous integration systems during research process, including for private repositories.)

-
-
-

What will GitHub think of this? -

-

GitHub documentation at the time of writing endorses the use of attachments to releases as a solution for distributing large files as part of your project:

-

-

Of course, it will be up to GitHub to decide if this use of release attachments is acceptable in the long term.

-
-
-
- - - -
- - - - -
- - - - - - - - diff --git a/docs/articles/intro_files/header-attrs-2.10/header-attrs.js b/docs/articles/intro_files/header-attrs-2.10/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/articles/intro_files/header-attrs-2.10/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/authors.html b/docs/authors.html deleted file mode 100644 index e6dc9b7..0000000 --- a/docs/authors.html +++ /dev/null @@ -1,123 +0,0 @@ - -Authors and Citation • piggyback - - -
-
- - - -
-
-
- - - -
  • -

    Carl Boettiger. Author, maintainer, copyright holder. -

    -
  • -
  • -

    Mark Padgham. Contributor. -

    -
  • -
  • -

    Jeffrey O Hanson. Contributor. -

    -
  • -
  • -

    Kevin Kuo. Contributor. -

    -
  • -
-
-
-

Citation

- Source: DESCRIPTION -
-
- - -

Boettiger C (2022). -piggyback: Managing Larger Data on a GitHub Repository. -R package version 0.1.1.9001, https://github.com/ropensci/piggyback. -

-
@Manual{,
-  title = {piggyback: Managing Larger Data on a GitHub Repository},
-  author = {Carl Boettiger},
-  year = {2022},
-  note = {R package version 0.1.1.9001},
-  url = {https://github.com/ropensci/piggyback},
-}
- -
- -
- - - -
- - - - - - - - diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css deleted file mode 100644 index 5a85941..0000000 --- a/docs/bootstrap-toc.css +++ /dev/null @@ -1,60 +0,0 @@ -/*! - * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) - * Copyright 2015 Aidan Feldman - * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ - -/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ - -/* All levels of nav */ -nav[data-toggle='toc'] .nav > li > a { - display: block; - padding: 4px 20px; - font-size: 13px; - font-weight: 500; - color: #767676; -} -nav[data-toggle='toc'] .nav > li > a:hover, -nav[data-toggle='toc'] .nav > li > a:focus { - padding-left: 19px; - color: #563d7c; - text-decoration: none; - background-color: transparent; - border-left: 1px solid #563d7c; -} -nav[data-toggle='toc'] .nav > .active > a, -nav[data-toggle='toc'] .nav > .active:hover > a, -nav[data-toggle='toc'] .nav > .active:focus > a { - padding-left: 18px; - font-weight: bold; - color: #563d7c; - background-color: transparent; - border-left: 2px solid #563d7c; -} - -/* Nav: second level (shown on .active) */ -nav[data-toggle='toc'] .nav .nav { - display: none; /* Hide by default, but at >768px, show it */ - padding-bottom: 10px; -} -nav[data-toggle='toc'] .nav .nav > li > a { - padding-top: 1px; - padding-bottom: 1px; - padding-left: 30px; - font-size: 12px; - font-weight: normal; -} -nav[data-toggle='toc'] .nav .nav > li > a:hover, -nav[data-toggle='toc'] .nav .nav > li > a:focus { - padding-left: 29px; -} -nav[data-toggle='toc'] .nav .nav > .active > a, -nav[data-toggle='toc'] .nav .nav > .active:hover > a, -nav[data-toggle='toc'] .nav .nav > .active:focus > a { - padding-left: 28px; - font-weight: 500; -} - -/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 
-nav[data-toggle='toc'] .nav > .active > ul { - display: block; -} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js deleted file mode 100644 index 1cdd573..0000000 --- a/docs/bootstrap-toc.js +++ /dev/null @@ -1,159 +0,0 @@ -/*! - * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) - * Copyright 2015 Aidan Feldman - * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ -(function() { - 'use strict'; - - window.Toc = { - helpers: { - // return all matching elements in the set, or their descendants - findOrFilter: function($el, selector) { - // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ - // http://stackoverflow.com/a/12731439/358804 - var $descendants = $el.find(selector); - return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); - }, - - generateUniqueIdBase: function(el) { - var text = $(el).text(); - var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); - return anchor || el.tagName.toLowerCase(); - }, - - generateUniqueId: function(el) { - var anchorBase = this.generateUniqueIdBase(el); - for (var i = 0; ; i++) { - var anchor = anchorBase; - if (i > 0) { - // add suffix - anchor += '-' + i; - } - // check if ID already exists - if (!document.getElementById(anchor)) { - return anchor; - } - } - }, - - generateAnchor: function(el) { - if (el.id) { - return el.id; - } else { - var anchor = this.generateUniqueId(el); - el.id = anchor; - return anchor; - } - }, - - createNavList: function() { - return $(''); - }, - - createChildNavList: function($parent) { - var $childList = this.createNavList(); - $parent.append($childList); - return $childList; - }, - - generateNavEl: function(anchor, text) { - var $a = $(''); - $a.attr('href', '#' + anchor); - $a.text(text); - var $li = $('
  • '); - $li.append($a); - return $li; - }, - - generateNavItem: function(headingEl) { - var anchor = this.generateAnchor(headingEl); - var $heading = $(headingEl); - var text = $heading.data('toc-text') || $heading.text(); - return this.generateNavEl(anchor, text); - }, - - // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). - getTopLevel: function($scope) { - for (var i = 1; i <= 6; i++) { - var $headings = this.findOrFilter($scope, 'h' + i); - if ($headings.length > 1) { - return i; - } - } - - return 1; - }, - - // returns the elements for the top level, and the next below it - getHeadings: function($scope, topLevel) { - var topSelector = 'h' + topLevel; - - var secondaryLevel = topLevel + 1; - var secondarySelector = 'h' + secondaryLevel; - - return this.findOrFilter($scope, topSelector + ',' + secondarySelector); - }, - - getNavLevel: function(el) { - return parseInt(el.tagName.charAt(1), 10); - }, - - populateNav: function($topContext, topLevel, $headings) { - var $context = $topContext; - var $prevNav; - - var helpers = this; - $headings.each(function(i, el) { - var $newNav = helpers.generateNavItem(el); - var navLevel = helpers.getNavLevel(el); - - // determine the proper $context - if (navLevel === topLevel) { - // use top level - $context = $topContext; - } else if ($prevNav && $context === $topContext) { - // create a new level of the tree and switch to it - $context = helpers.createChildNavList($prevNav); - } // else use the current $context - - $context.append($newNav); - - $prevNav = $newNav; - }); - }, - - parseOps: function(arg) { - var opts; - if (arg.jquery) { - opts = { - $nav: arg - }; - } else { - opts = arg; - } - opts.$scope = opts.$scope || $(document.body); - return opts; - } - }, - - // accepts a jQuery object, or an options object - init: function(opts) { - opts = this.helpers.parseOps(opts); - - // ensure that the data attribute is in place for styling - opts.$nav.attr('data-toggle', 'toc'); - - var $topContext = this.helpers.createChildNavList(opts.$nav); - var topLevel = this.helpers.getTopLevel(opts.$scope); - var $headings = this.helpers.getHeadings(opts.$scope, topLevel); - this.helpers.populateNav($topContext, topLevel, $headings); - } - }; - - $(function() { - $('nav[data-toggle="toc"]').each(function(i, el) { - var $nav = $(el); - 
Toc.init($nav); - }); - }); -})(); diff --git a/docs/docsearch.css b/docs/docsearch.css deleted file mode 100644 index e5f1fe1..0000000 --- a/docs/docsearch.css +++ /dev/null @@ -1,148 +0,0 @@ -/* Docsearch -------------------------------------------------------------- */ -/* - Source: https://github.com/algolia/docsearch/ - License: MIT -*/ - -.algolia-autocomplete { - display: block; - -webkit-box-flex: 1; - -ms-flex: 1; - flex: 1 -} - -.algolia-autocomplete .ds-dropdown-menu { - width: 100%; - min-width: none; - max-width: none; - padding: .75rem 0; - background-color: #fff; - background-clip: padding-box; - border: 1px solid rgba(0, 0, 0, .1); - box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); -} - -@media (min-width:768px) { - .algolia-autocomplete .ds-dropdown-menu { - width: 175% - } -} - -.algolia-autocomplete .ds-dropdown-menu::before { - display: none -} - -.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { - padding: 0; - background-color: rgb(255,255,255); - border: 0; - max-height: 80vh; -} - -.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { - margin-top: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion { - padding: 0; - overflow: visible -} - -.algolia-autocomplete .algolia-docsearch-suggestion--category-header { - padding: .125rem 1rem; - margin-top: 0; - font-size: 1.3em; - font-weight: 500; - color: #00008B; - border-bottom: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { - float: none; - padding-top: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { - float: none; - width: auto; - padding: 0; - text-align: left -} - -.algolia-autocomplete .algolia-docsearch-suggestion--content { - float: none; - width: auto; - padding: 0 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--content::before { - display: none -} - -.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { - padding-top: .75rem; - margin-top: .75rem; 
- border-top: 1px solid rgba(0, 0, 0, .1) -} - -.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { - display: block; - padding: .1rem 1rem; - margin-bottom: 0.1; - font-size: 1.0em; - font-weight: 400 - /* display: none */ -} - -.algolia-autocomplete .algolia-docsearch-suggestion--title { - display: block; - padding: .25rem 1rem; - margin-bottom: 0; - font-size: 0.9em; - font-weight: 400 -} - -.algolia-autocomplete .algolia-docsearch-suggestion--text { - padding: 0 1rem .5rem; - margin-top: -.25rem; - font-size: 0.8em; - font-weight: 400; - line-height: 1.25 -} - -.algolia-autocomplete .algolia-docsearch-footer { - width: 110px; - height: 20px; - z-index: 3; - margin-top: 10.66667px; - float: right; - font-size: 0; - line-height: 0; -} - -.algolia-autocomplete .algolia-docsearch-footer--logo { - background-image: url("data:image/svg+xml;utf8,"); - background-repeat: no-repeat; - background-position: 50%; - background-size: 100%; - overflow: hidden; - text-indent: -9000px; - width: 100%; - height: 100%; - display: block; - transform: translate(-8px); -} - -.algolia-autocomplete .algolia-docsearch-suggestion--highlight { - color: #FF8C00; - background: rgba(232, 189, 54, 0.1) -} - - -.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { - box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) -} - -.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { - background-color: rgba(192, 192, 192, .15) -} diff --git a/docs/docsearch.js b/docs/docsearch.js deleted file mode 100644 index b35504c..0000000 --- a/docs/docsearch.js +++ /dev/null @@ -1,85 +0,0 @@ -$(function() { - - // register a handler to move the focus to the search bar - // upon pressing shift + "/" (i.e. 
"?") - $(document).on('keydown', function(e) { - if (e.shiftKey && e.keyCode == 191) { - e.preventDefault(); - $("#search-input").focus(); - } - }); - - $(document).ready(function() { - // do keyword highlighting - /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ - var mark = function() { - - var referrer = document.URL ; - var paramKey = "q" ; - - if (referrer.indexOf("?") !== -1) { - var qs = referrer.substr(referrer.indexOf('?') + 1); - var qs_noanchor = qs.split('#')[0]; - var qsa = qs_noanchor.split('&'); - var keyword = ""; - - for (var i = 0; i < qsa.length; i++) { - var currentParam = qsa[i].split('='); - - if (currentParam.length !== 2) { - continue; - } - - if (currentParam[0] == paramKey) { - keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); - } - } - - if (keyword !== "") { - $(".contents").unmark({ - done: function() { - $(".contents").mark(keyword); - } - }); - } - } - }; - - mark(); - }); -}); - -/* Search term highlighting ------------------------------*/ - -function matchedWords(hit) { - var words = []; - - var hierarchy = hit._highlightResult.hierarchy; - // loop to fetch from lvl0, lvl1, etc. 
- for (var idx in hierarchy) { - words = words.concat(hierarchy[idx].matchedWords); - } - - var content = hit._highlightResult.content; - if (content) { - words = words.concat(content.matchedWords); - } - - // return unique words - var words_uniq = [...new Set(words)]; - return words_uniq; -} - -function updateHitURL(hit) { - - var words = matchedWords(hit); - var url = ""; - - if (hit.anchor) { - url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; - } else { - url = hit.url + '?q=' + escape(words.join(" ")); - } - - return url; -} diff --git a/docs/favicon-16x16.png b/docs/favicon-16x16.png deleted file mode 100644 index 0c21089..0000000 Binary files a/docs/favicon-16x16.png and /dev/null differ diff --git a/docs/favicon-32x32.png b/docs/favicon-32x32.png deleted file mode 100644 index 3b61c02..0000000 Binary files a/docs/favicon-32x32.png and /dev/null differ diff --git a/docs/favicon.ico b/docs/favicon.ico deleted file mode 100644 index 1d84ff7..0000000 Binary files a/docs/favicon.ico and /dev/null differ diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index d579e91..0000000 --- a/docs/index.html +++ /dev/null @@ -1,233 +0,0 @@ - - - - - - - -Managing Larger Data on a GitHub Repository • piggyback - - - - - - - - - - - - - - - - - - - -
    -
    - - - - -
    -
    -
    - - - -

    Because larger (> 50 MB) data files cannot easily be committed to git, a different approach is required to manage data associated with an analysis in a GitHub repository. This package provides a simple work-around by allowing larger (up to 2 GB per file) data files to piggyback on a repository as assets attached to individual GitHub releases. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. No authentication is required to download data from public repositories.

    -
    -

    Installation -

    -

    Install from CRAN via

    -
    -install.packages("piggyback")
    -

    You can install the development version from GitHub with:

    -
    -# install.packages("devtools")
    -devtools::install_github("ropensci/piggyback")
    -
    -
    -

    Quickstart -

    -

    See the piggyback vignette for details on authentication and additional package functionality.

    -

    Piggyback can download data attached to a release on any repository:

    -
    -library(piggyback)
    -pb_download("iris.tsv.gz", repo = "cboettig/piggyback-tests", dest = tempdir())
    -#> Warning in pb_download("iris.tsv.gz", repo = "cboettig/piggyback-tests", :
    -#> file(s) iris.tsv.gz not found in repo cboettig/piggyback-tests
    -

    Downloading from private repos or uploading to any repo requires authentication, so be sure to set a GITHUB_TOKEN (or GITHUB_PAT) environmental variable, or include the .token argument. Omit the file name to download all attached objects. Omit the repository name to default to the current repository. See introductory vignette or function documentation for details.

    -

    We can also upload data to any existing release (defaults to latest):

    -
    -## We'll need some example data first.
    -## Pro tip: compress your tabular data to save space & speed upload/downloads
    -readr::write_tsv(mtcars, "mtcars.tsv.gz")
    -
    -pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests")
    -
    -
    -

    Git LFS and other alternatives -

    -

    piggyback acts like a poor soul’s Git LFS. Git LFS is not only expensive, it also breaks GitHub’s collaborative model – basically if someone wants to submit a PR with a simple edit to your docs, they cannot fork your repository since that would otherwise count against your Git LFS storage. Unlike Git LFS, piggyback doesn’t take over your standard git client, it just perches comfortably on the shoulders of your existing GitHub API. Data can be versioned by piggyback, but relative to git LFS versioning is less strict: uploads can be set as a new version or allowed to overwrite previously uploaded data.

    -
    -
    -

    But what will GitHub think of this? -

    -

    GitHub documentation at the time of writing endorses the use of attachments to releases as a solution for distributing large files as part of your project:

    -

    -

    Of course, it will be up to GitHub to decide if this use of release attachments is acceptable in the long term.

    - -

    Also see our vignette comparing alternatives.

    -
    -

    Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.

    -

    ropensci_footer

    -
    -
    -
    - - -
    - - -
    - -
    -

    -

    Site built with pkgdown 2.0.1.

    -
    - -
    -
    - - - - - - - - diff --git a/docs/jquery.sticky-kit.min.js b/docs/jquery.sticky-kit.min.js deleted file mode 100644 index 1c16271..0000000 --- a/docs/jquery.sticky-kit.min.js +++ /dev/null @@ -1,11 +0,0 @@ -/* Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | */ -/* - Source: https://github.com/leafo/sticky-kit - License: MIT -*/ -(function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); -if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
    "))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, -u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), -a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", -y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n - - - - - diff --git a/docs/logo.png b/docs/logo.png deleted file mode 100644 index 0262138..0000000 
Binary files a/docs/logo.png and /dev/null differ diff --git a/docs/logo.svg b/docs/logo.svg deleted file mode 100644 index 462fc18..0000000 --- a/docs/logo.svg +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - image/svg+xml - - RStudio_Hex 2016 v7 outlines - - - - - - - - - - - RStudio_Hex 2016 v7 outlines - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/news/index.html b/docs/news/index.html deleted file mode 100644 index db68813..0000000 --- a/docs/news/index.html +++ /dev/null @@ -1,159 +0,0 @@ - -Changelog • piggyback - - -
    -
    - - - -
    -
    - - -
    - -
    • update intro vignette to remove all mentions of pb_track(), pb_push(), and pb_pull() which were removed as of version 0.0.0.9900
    • -
    • -pb_upload() now handles the dir argument to control relative path directories.
    • -
    • update intro vignette to remove mention of path name handling and instead provide examples of how path names are handled.
    • -
    • update intro vignette instructions for git authentication
    • -
    -
    - -
    • switch to gh::gh_token() for token management. Still supports the same env var approach, but also compatible with gitcreds and other use.
    • -
    • resolve issue in pb_upload() when creating a new tag in the process, previously data would be attached to the previously latest tag instead of the newly created one.
    • -
    • resolve issue in pb_download() where httr would report a 401 status even after data successfully downloads.
    • -
    -
    - -
    • address remaining authentication issue in changes to GitHub API (on pb_upload()) [#47]
    • -
    • Use flat file structure on upload/download instead of encoding path [#48]
    • -
    • improve performance via more aggressive memoising of pb_info() calls, increasing default piggyback_cache_duration to 10 minutes [#46]
    • -
    • Resolve bug introduced by API changes that would stop creation of tags on repos with default branch called main or without previous releases [#48]
    • -
    -
    - -
    • address issues in authentication due to changes in GitHub API (#37)
    • -
    -
    - -
    • -guess_repo() now infers a remote when there are multiple associated with the repo. The “upstream” (preferred) or “origin” repo is selected if either exists, otherwise the function errors and asks the user to explicitly specify a repo (#31).
    • -
    • -release_info() now works properly when there are no existing releases, which enables the usage of pb_new_release() on repos without a release (#29).
    • -
    • Fix error on pb_info() under certain cases which resulted in Error in a[[1]] : subscript out of bounds, (#36)
    • -
    • Fix CRAN unit-test on deleting file
    • -
    -
    - -
    • Improve interface regarding overwrite behavior in pb_upload() (#25)
    • -
    • Bugfixes for errors introduced in 0.0.9: -
      • Access all assets on a release instead of first 30. This could break upload and download. (#23, #24)
      • -
      • Uploading of directory paths could cause download errors in pb_download(). (#24, #26)
      • -
    • -
    -
    - -
    • Enable re-upload and deletion of partially uploaded files (#19)
    • -
    -
    - -
    • Updates to documentation, streamlining tests
    • -
    • remove dependency on utils::askYesNo which is only available in R >= 3.5.0
    • -
    -
    - -
    • Initial release to CRAN
    • -

    -
    - -
    • bugfix for migrating unit test
    • -
    -
    - -
    • bugfix for migrating unit test, JOSS submission
    • -
    -
    - -
    • initial Onboarding to rOpenSci
    • -
    -
    - -
    • Added a NEWS.md file to track changes to the package.
    • -
    -
    - - - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/onboarding-submission.html b/docs/onboarding-submission.html deleted file mode 100644 index e0698a1..0000000 --- a/docs/onboarding-submission.html +++ /dev/null @@ -1,260 +0,0 @@ - - - - - - - - -NA • piggyback - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - - -
    -

    -Summary

    -
      -
    • Allow large and binary data files to “piggyback” on top of your existing repositories. push and pull large-ish (< 2GB) data files to & from GitHub repositories as attachments to a GitHub release;

    • -
    • Paste the full DESCRIPTION file inside a code block below:

    • -
    -
    Package: piggyback
    -Version: 0.0.0.9000
    -Title: Managing Larger Data on a GitHub Repository
    -Description: Because larger (> 50 MB) data files cannot easily be committed to git,
    -  a different approach is required to manage data associated with an analysis in a
    -  GitHub repository.  This package provides a simple work-around by allowing larger
    -  (up to 2 GB) data files to piggyback on a repository as assets attached to individual
    -  GitHub releases.  These files are not handled by git in any way, but instead are
    -  uploaded, downloaded, or edited directly by calls through the GitHub API. These
    -  data files can be versioned manually by creating different releases.  This approach
    -  works equally well with public or private repositories.  Data can be uploaded
    -  and downloaded programmatically from scripts. No authentication is required to
    -  download data from public repositories.
    -Authors@R: person("Carl", "Boettiger",
    -                  email = "cboettig@gmail.com",
    -                  role = c("aut", "cre", "cph"),
    -                  comment=c(ORCID = "0000-0002-1642-628X"))
    -URL: https://github.com/cboettig/piggyback
    -BugReports: https://github.com/cboettig/piggyback/issues
    -License: GPL-3
    -Encoding: UTF-8
    -LazyData: true
    -ByteCompile: true
    -Imports:
    -    gh,
    -    httr,
    -    jsonlite,
    -    git2r,
    -    fs,
    -    usethis,
    -    crayon,
    -    clisymbols
    -Suggests:
    -    readr,
    -    covr,
    -    testthat,
    -    datasets,
    -    knitr,
    -    rmarkdown
    -VignetteBuilder: knitr
    -RoxygenNote: 6.0.1.9000
    -Roxygen: list(markdown = TRUE)
    -
    -
      -
    • URL for the package (the development repository, not a stylized html page):
    • -
    -

    https://github.com/cboettig/piggyback

    -
      -
    • Please indicate which category or categories from our package fit policies this package falls under *and why(? (e.g., data retrieval, reproducibility. If you are unsure, we suggest you make a pre-submission inquiry.):
    • -
    -

    reproducibility, because accessing data being analyzed is essential for reproducible workflows, and yet we have no good solution for workflows with unpublished data or private workflows to do this once the data is too large for version control (e.g. files > 50 mb).

    -
      -
    • Who is the target audience and what are scientific applications of this package?
    • -
    -

    The target audience is anyone working with data files on GitHub.

    - -

    datastorr on ropenscilabs is the closest match, which takes a very different approach (from the user perspective – on the back end both store data on GitHub assets) to the essentially the same problem. The Intro vignette discusses at greater length many of the alternative possible strategies and why I feel they have all fallen short of my needs and led to me creating this package.

    -
      -
    • If you made a pre-submission enquiry, please paste the link to the corresponding issue, forum post, or other discussion, or @tag the editor you contacted.
    • -
    -
    -
    -

    -Requirements

    -

    Confirm each of the following by checking the box. This package:

    -
      -
    • [x] does not violate the Terms of Service of any service it interacts with.
    • -
    • [x] has a CRAN and OSI accepted license.
    • -
    • [x] contains a README with instructions for installing the development version.
    • -
    • [x] includes documentation with examples for all functions.
    • -
    • [x] contains a vignette with examples of its essential functions and uses.
    • -
    • [x] has a test suite.
    • -
    • [x] has continuous integration, including reporting of test coverage, using services such as Travis CI, Coveralls and/or CodeCov.
    • -
    • [x] I agree to abide by ROpenSci’s Code of Conduct during the review process and in maintaining my package should it be accepted.
    • -
    -
    -

    -Publication options

    -
      -
    • [x] Do you intend for this package to go on CRAN?
    • -
    • [x] Do you wish to automatically submit to the Journal of Open Source Software? If so: -
        -
      • [x] The package has an obvious research application according to JOSS’s definition.
      • -
      • [x] The package contains a paper.md matching JOSS’s requirements with a high-level description in the package root or in inst/.
      • -
      • [ ] The package is deposited in a long-term repository with the DOI:
      • -
      • (Do not submit your package separately to JOSS)
      • -
      -
    • -
    • [ ] Do you wish to submit an Applications Article about your package to Methods in Ecology and Evolution? If so: -
        -
      • [ ] The package is novel and will be of interest to the broad readership of the journal.
      • -
      • [ ] The manuscript describing the package is no longer than 3000 words.
      • -
      • [ ] You intend to archive the code for the package in a long-term repository which meets the requirements of the journal (see MEE’s Policy on Publishing Code)
      • -
      • (Scope: Do consider MEE’s Aims and Scope for your manuscript. We make no gaurantee that your manuscript willl be within MEE scope.)
      • -
      • (Although not required, we strongly recommend having a full manuscript prepared when you submit here.)
      • -
      • (Please do not submit your package separately to Methods in Ecology and Evolution)
      • -
      -
    • -
    -
    -
    -
    -

    -Detail

    -
      -
    • x ] Does R CMD check (or devtools::check()) succeed? Paste and describe any errors or warnings:
    • -
    -

    No errors, notes, or warnings.

    -
      -
    • [x] Does the package conform to rOpenSci packaging guidelines? Please describe any exceptions:

    • -
    • If this is a resubmission following rejection, please explain the change in circumstances:

    • -
    • If possible, please provide recommendations of reviewers - those with experience with similar packages and/or likely users of your package - and their GitHub user names:

    • -
    -

    Rich FitzJohn, @richfitz, would be great based on his experience in this area and with datastorr. Jenny Bryan, @Jennybc, since this package makes heavy use of usethis and GitHub interactions.

    -
    - - -
    - -
    - - -
    - - -
    -

    Site built with pkgdown.

    -
    - -
    -
    - - - - - - diff --git a/docs/paper.html b/docs/paper.html deleted file mode 100644 index c42a881..0000000 --- a/docs/paper.html +++ /dev/null @@ -1,103 +0,0 @@ - -Piggyback: Working with larger data in GitHub • piggyback - - -
    -
    - - - -
    -
    - - - -
    - -

    GitHub has become a central component for preserving and sharing software-driven analysis in academic research [@Ram2013]. As scientists adopt this workflow, a desire to manage data associated with the analysis in the same manner soon emerges. While small data can easily be committed to GitHub repositories along-side source code and analysis scripts, files larger than 50 MB cannot. Existing work-arounds introduce significant complexity and break the ease of sharing [@Boettiger2018].

    -

    This package provides a simple work-around by allowing larger (up to 2 GB) data files to piggyback on a repository as assets attached to individual GitHub releases. piggyback provides a workflow similar to Git LFS [@GitLFS], in which data files can be tracked by type and pushed and pulled to GitHub with dedicated commands. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API [@API3]. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. No authentication is required to download data from public repositories.

    -
    -

    Examples

    -

    As long as a repository has at least one release, users can upload a set of specified files from the current repository to that release by simply passing the file names to pb_upload(). Specify individual files to download using pb_download(), or use no arguments to download all data files attached to the latest release. Alternatively, users can track files by a given pattern: for instance, pb_track("*.csv") will track all *.csv files in the repository. Then use pb_upload(pb_track()) to upload all currently tracked files. piggyback compares timestamps to avoid unnecessary transfer. The piggyback package looks for the same GITHUB_TOKEN environmental variable for authentication that is used across GitHub APIs. Details are provided in an introductory vignette [@Boettiger2018b].

    -
    -
    -
    -

    References

    -
    - - -
    - - - -
    - - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/pkgdown.css b/docs/pkgdown.css deleted file mode 100644 index 80ea5b8..0000000 --- a/docs/pkgdown.css +++ /dev/null @@ -1,384 +0,0 @@ -/* Sticky footer */ - -/** - * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ - * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css - * - * .Site -> body > .container - * .Site-content -> body > .container .row - * .footer -> footer - * - * Key idea seems to be to ensure that .container and __all its parents__ - * have height set to 100% - * - */ - -html, body { - height: 100%; -} - -body { - position: relative; -} - -body > .container { - display: flex; - height: 100%; - flex-direction: column; -} - -body > .container .row { - flex: 1 0 auto; -} - -footer { - margin-top: 45px; - padding: 35px 0 36px; - border-top: 1px solid #e5e5e5; - color: #666; - display: flex; - flex-shrink: 0; -} -footer p { - margin-bottom: 0; -} -footer div { - flex: 1; -} -footer .pkgdown { - text-align: right; -} -footer p { - margin-bottom: 0; -} - -img.icon { - float: right; -} - -/* Ensure in-page images don't run outside their container */ -.contents img { - max-width: 100%; - height: auto; -} - -/* Fix bug in bootstrap (only seen in firefox) */ -summary { - display: list-item; -} - -/* Typographic tweaking ---------------------------------*/ - -.contents .page-header { - margin-top: calc(-60px + 1em); -} - -dd { - margin-left: 3em; -} - -/* Section anchors ---------------------------------*/ - -a.anchor { - display: none; - margin-left: 5px; - width: 20px; - height: 20px; - - background-image: url(./link.svg); - background-repeat: no-repeat; - background-size: 20px 20px; - background-position: center center; -} - -h1:hover .anchor, -h2:hover .anchor, -h3:hover .anchor, -h4:hover .anchor, -h5:hover .anchor, -h6:hover .anchor { - display: inline-block; -} - -/* Fixes for fixed navbar --------------------------*/ - -.contents h1, 
.contents h2, .contents h3, .contents h4 { - padding-top: 60px; - margin-top: -40px; -} - -/* Navbar submenu --------------------------*/ - -.dropdown-submenu { - position: relative; -} - -.dropdown-submenu>.dropdown-menu { - top: 0; - left: 100%; - margin-top: -6px; - margin-left: -1px; - border-radius: 0 6px 6px 6px; -} - -.dropdown-submenu:hover>.dropdown-menu { - display: block; -} - -.dropdown-submenu>a:after { - display: block; - content: " "; - float: right; - width: 0; - height: 0; - border-color: transparent; - border-style: solid; - border-width: 5px 0 5px 5px; - border-left-color: #cccccc; - margin-top: 5px; - margin-right: -10px; -} - -.dropdown-submenu:hover>a:after { - border-left-color: #ffffff; -} - -.dropdown-submenu.pull-left { - float: none; -} - -.dropdown-submenu.pull-left>.dropdown-menu { - left: -100%; - margin-left: 10px; - border-radius: 6px 0 6px 6px; -} - -/* Sidebar --------------------------*/ - -#pkgdown-sidebar { - margin-top: 30px; - position: -webkit-sticky; - position: sticky; - top: 70px; -} - -#pkgdown-sidebar h2 { - font-size: 1.5em; - margin-top: 1em; -} - -#pkgdown-sidebar h2:first-child { - margin-top: 0; -} - -#pkgdown-sidebar .list-unstyled li { - margin-bottom: 0.5em; -} - -/* bootstrap-toc tweaks ------------------------------------------------------*/ - -/* All levels of nav */ - -nav[data-toggle='toc'] .nav > li > a { - padding: 4px 20px 4px 6px; - font-size: 1.5rem; - font-weight: 400; - color: inherit; -} - -nav[data-toggle='toc'] .nav > li > a:hover, -nav[data-toggle='toc'] .nav > li > a:focus { - padding-left: 5px; - color: inherit; - border-left: 1px solid #878787; -} - -nav[data-toggle='toc'] .nav > .active > a, -nav[data-toggle='toc'] .nav > .active:hover > a, -nav[data-toggle='toc'] .nav > .active:focus > a { - padding-left: 5px; - font-size: 1.5rem; - font-weight: 400; - color: inherit; - border-left: 2px solid #878787; -} - -/* Nav: second level (shown on .active) */ - -nav[data-toggle='toc'] .nav .nav { - 
display: none; /* Hide by default, but at >768px, show it */ - padding-bottom: 10px; -} - -nav[data-toggle='toc'] .nav .nav > li > a { - padding-left: 16px; - font-size: 1.35rem; -} - -nav[data-toggle='toc'] .nav .nav > li > a:hover, -nav[data-toggle='toc'] .nav .nav > li > a:focus { - padding-left: 15px; -} - -nav[data-toggle='toc'] .nav .nav > .active > a, -nav[data-toggle='toc'] .nav .nav > .active:hover > a, -nav[data-toggle='toc'] .nav .nav > .active:focus > a { - padding-left: 15px; - font-weight: 500; - font-size: 1.35rem; -} - -/* orcid ------------------------------------------------------------------- */ - -.orcid { - font-size: 16px; - color: #A6CE39; - /* margins are required by official ORCID trademark and display guidelines */ - margin-left:4px; - margin-right:4px; - vertical-align: middle; -} - -/* Reference index & topics ----------------------------------------------- */ - -.ref-index th {font-weight: normal;} - -.ref-index td {vertical-align: top; min-width: 100px} -.ref-index .icon {width: 40px;} -.ref-index .alias {width: 40%;} -.ref-index-icons .alias {width: calc(40% - 40px);} -.ref-index .title {width: 60%;} - -.ref-arguments th {text-align: right; padding-right: 10px;} -.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} -.ref-arguments .name {width: 20%;} -.ref-arguments .desc {width: 80%;} - -/* Nice scrolling for wide elements --------------------------------------- */ - -table { - display: block; - overflow: auto; -} - -/* Syntax highlighting ---------------------------------------------------- */ - -pre, code, pre code { - background-color: #f8f8f8; - color: #333; -} -pre, pre code { - white-space: pre-wrap; - word-break: break-all; - overflow-wrap: break-word; -} - -pre { - border: 1px solid #eee; -} - -pre .img, pre .r-plt { - margin: 5px 0; -} - -pre .img img, pre .r-plt img { - background-color: #fff; -} - -code a, pre a { - color: #375f84; -} - -a.sourceLine:hover { - text-decoration: none; -} - -.fl 
{color: #1514b5;} -.fu {color: #000000;} /* function */ -.ch,.st {color: #036a07;} /* string */ -.kw {color: #264D66;} /* keyword */ -.co {color: #888888;} /* comment */ - -.error {font-weight: bolder;} -.warning {font-weight: bolder;} - -/* Clipboard --------------------------*/ - -.hasCopyButton { - position: relative; -} - -.btn-copy-ex { - position: absolute; - right: 0; - top: 0; - visibility: hidden; -} - -.hasCopyButton:hover button.btn-copy-ex { - visibility: visible; -} - -/* headroom.js ------------------------ */ - -.headroom { - will-change: transform; - transition: transform 200ms linear; -} -.headroom--pinned { - transform: translateY(0%); -} -.headroom--unpinned { - transform: translateY(-100%); -} - -/* mark.js ----------------------------*/ - -mark { - background-color: rgba(255, 255, 51, 0.5); - border-bottom: 2px solid rgba(255, 153, 51, 0.3); - padding: 1px; -} - -/* vertical spacing after htmlwidgets */ -.html-widget { - margin-bottom: 10px; -} - -/* fontawesome ------------------------ */ - -.fab { - font-family: "Font Awesome 5 Brands" !important; -} - -/* don't display links in code chunks when printing */ -/* source: https://stackoverflow.com/a/10781533 */ -@media print { - code a:link:after, code a:visited:after { - content: ""; - } -} - -/* Section anchors --------------------------------- - Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 -*/ - -div.csl-bib-body { } -div.csl-entry { - clear: both; -} -.hanging-indent div.csl-entry { - margin-left:2em; - text-indent:-2em; -} -div.csl-left-margin { - min-width:2em; - float:left; -} -div.csl-right-inline { - margin-left:2em; - padding-left:1em; -} -div.csl-indent { - margin-left: 2em; -} diff --git a/docs/pkgdown.js b/docs/pkgdown.js deleted file mode 100644 index 6f0eee4..0000000 --- a/docs/pkgdown.js +++ /dev/null @@ -1,108 +0,0 @@ -/* http://gregfranko.com/blog/jquery-best-practices/ */ -(function($) { - $(function() { - - $('.navbar-fixed-top').headroom(); - 
- $('body').css('padding-top', $('.navbar').height() + 10); - $(window).resize(function(){ - $('body').css('padding-top', $('.navbar').height() + 10); - }); - - $('[data-toggle="tooltip"]').tooltip(); - - var cur_path = paths(location.pathname); - var links = $("#navbar ul li a"); - var max_length = -1; - var pos = -1; - for (var i = 0; i < links.length; i++) { - if (links[i].getAttribute("href") === "#") - continue; - // Ignore external links - if (links[i].host !== location.host) - continue; - - var nav_path = paths(links[i].pathname); - - var length = prefix_length(nav_path, cur_path); - if (length > max_length) { - max_length = length; - pos = i; - } - } - - // Add class to parent
  • , and enclosing
  • if in dropdown - if (pos >= 0) { - var menu_anchor = $(links[pos]); - menu_anchor.parent().addClass("active"); - menu_anchor.closest("li.dropdown").addClass("active"); - } - }); - - function paths(pathname) { - var pieces = pathname.split("/"); - pieces.shift(); // always starts with / - - var end = pieces[pieces.length - 1]; - if (end === "index.html" || end === "") - pieces.pop(); - return(pieces); - } - - // Returns -1 if not found - function prefix_length(needle, haystack) { - if (needle.length > haystack.length) - return(-1); - - // Special case for length-0 haystack, since for loop won't run - if (haystack.length === 0) { - return(needle.length === 0 ? 0 : -1); - } - - for (var i = 0; i < haystack.length; i++) { - if (needle[i] != haystack[i]) - return(i); - } - - return(haystack.length); - } - - /* Clipboard --------------------------*/ - - function changeTooltipMessage(element, msg) { - var tooltipOriginalTitle=element.getAttribute('data-original-title'); - element.setAttribute('data-original-title', msg); - $(element).tooltip('show'); - element.setAttribute('data-original-title', tooltipOriginalTitle); - } - - if(ClipboardJS.isSupported()) { - $(document).ready(function() { - var copyButton = ""; - - $("div.sourceCode").addClass("hasCopyButton"); - - // Insert copy buttons: - $(copyButton).prependTo(".hasCopyButton"); - - // Initialize tooltips: - $('.btn-copy-ex').tooltip({container: 'body'}); - - // Initialize clipboard: - var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { - text: function(trigger) { - return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); - } - }); - - clipboardBtnCopies.on('success', function(e) { - changeTooltipMessage(e.trigger, 'Copied!'); - e.clearSelection(); - }); - - clipboardBtnCopies.on('error', function() { - changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); - }); - }); - } -})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml deleted file mode 
100644 index a527a05..0000000 --- a/docs/pkgdown.yml +++ /dev/null @@ -1,8 +0,0 @@ -pandoc: 2.11.4 -pkgdown: 2.0.1 -pkgdown_sha: ~ -articles: - alternatives: alternatives.html - intro: intro.html -last_built: 2022-02-09T19:01Z - diff --git a/docs/reference/Rplot001.png b/docs/reference/Rplot001.png deleted file mode 100644 index 17a3580..0000000 Binary files a/docs/reference/Rplot001.png and /dev/null differ diff --git a/docs/reference/figures/github-policy.png b/docs/reference/figures/github-policy.png deleted file mode 100644 index b08345b..0000000 Binary files a/docs/reference/figures/github-policy.png and /dev/null differ diff --git a/docs/reference/figures/logo.png b/docs/reference/figures/logo.png deleted file mode 100644 index 0262138..0000000 Binary files a/docs/reference/figures/logo.png and /dev/null differ diff --git a/docs/reference/figures/logo.svg b/docs/reference/figures/logo.svg deleted file mode 100644 index 462fc18..0000000 --- a/docs/reference/figures/logo.svg +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - image/svg+xml - - RStudio_Hex 2016 v7 outlines - - - - - - - - - - - RStudio_Hex 2016 v7 outlines - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/reference/index.html b/docs/reference/index.html deleted file mode 100644 index 94e30d4..0000000 --- a/docs/reference/index.html +++ /dev/null @@ -1,118 +0,0 @@ - -Function reference • piggyback - - -
    -
    - - - -
    -
    - - - - - - - - - - - - - - - - - -
    -

    All functions

    -

    -
    -

    pb_delete()

    -

    Delete an asset attached to a release

    -

    pb_download()

    -

    Download data from an existing release

    -

    pb_download_url()

    -

    Get the download url of a given file

    -

    pb_list()

    -

    List all assets attached to a release

    -

    pb_new_release()

    -

    Create a new release on GitHub repo

    -

    pb_upload()

    -

    Upload data to an existing release

    -

    piggyback-package

    -

    piggyback: Managing Larger Data on a GitHub Repository

    - - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/manifest.json b/docs/reference/manifest.json deleted file mode 100644 index 1729967..0000000 --- a/docs/reference/manifest.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data/data/iris.tsv.gz": "70e13d6b359633296f73b60f600dd3f7", - "data/data/mtcars.tsv.gz": "10bd19ad61e72b9ba14c6d4f0d26e651", - "data/iris.tsv.gz": "70e13d6b359633296f73b60f600dd3f7", - "data/mtcars.tsv.gz": "10bd19ad61e72b9ba14c6d4f0d26e651", - "mtcars.tsv.gz": "10bd19ad61e72b9ba14c6d4f0d26e651", - "tests/testthat/data/iris.tsv.gz": "70e13d6b359633296f73b60f600dd3f7", - "tests/testthat/data/mtcars.tsv.gz": "10bd19ad61e72b9ba14c6d4f0d26e651" -} diff --git a/docs/reference/mtcars.tsv.xz b/docs/reference/mtcars.tsv.xz deleted file mode 100644 index 43c151d..0000000 Binary files a/docs/reference/mtcars.tsv.xz and /dev/null differ diff --git a/docs/reference/pb_delete.html b/docs/reference/pb_delete.html deleted file mode 100644 index 53e1694..0000000 --- a/docs/reference/pb_delete.html +++ /dev/null @@ -1,135 +0,0 @@ - -Delete an asset attached to a release — pb_delete • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    Delete an asset attached to a release

    -
    - -
    -
    pb_delete(
    -  file = NULL,
    -  repo = guess_repo(),
    -  tag = "latest",
    -  .token = get_token()
    -)
    -
    - -
    -

    Arguments

    -
    file
    -

    file(s) to be deleted from the release. If NULL (default -when argument is omitted), function will delete all attachments to the release. -delete

    -
    repo
    -

    Repository name in format "owner/repo". Will guess the current -repo if not specified.

    -
    tag
    -

    tag for the GitHub release to which this data should be attached.

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    -
    -

    Value

    -

    TRUE (invisibly) if a file is found and deleted. -Otherwise, returns NULL (invisibly) if no file matching the name was found.

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    -readr::write_tsv(mtcars, "mtcars.tsv.gz")
    -## Upload
    -pb_upload("mtcars.tsv.gz",
    -          repo = "cboettig/piggyback-tests",
    -           overwrite = TRUE)
    -pb_delete("mtcars.tsv.gz",
    -          repo = "cboettig/piggyback-tests",
    -          tag = "v0.0.1")
    -}
    -
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/pb_download.html b/docs/reference/pb_download.html deleted file mode 100644 index f3865df..0000000 --- a/docs/reference/pb_download.html +++ /dev/null @@ -1,151 +0,0 @@ - -Download data from an existing release — pb_download • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    Download data from an existing release

    -
    - -
    -
    pb_download(
    -  file = NULL,
    -  dest = ".",
    -  repo = guess_repo(),
    -  tag = "latest",
    -  overwrite = TRUE,
    -  ignore = "manifest.json",
    -  use_timestamps = TRUE,
    -  show_progress = TRUE,
    -  .token = get_token()
    -)
    -
    - -
    -

    Arguments

    -
    file
    -

    name or vector of names of files to be downloaded. If NULL, -all assets attached to the release will be downloaded.

    -
    dest
    -

    name of vector of names of where file should be downloaded. -Can be a directory or a list of filenames the same length as file -vector. Any directories in the path provided must already exist.

    -
    repo
    -

    Repository name in format "owner/repo". Will guess the current -repo if not specified.

    -
    tag
    -

    tag for the GitHub release to which this data should be attached.

    -
    overwrite
    -

    Should any local files of the same name be overwritten? -default TRUE.

    -
    ignore
    -

    a list of files to ignore (if downloading "all" because -file=NULL).

    -
    use_timestamps
    -

    DEPRECATED.

    -
    show_progress
    -

    logical, show a progress bar be shown for uploading? -Defaults to TRUE.

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    - ## Download a specific file.
    - ## (dest can be omitted when run inside and R project)
    - piggyback::pb_download("iris.tsv.gz",
    -                        repo = "cboettig/piggyback-tests",
    -                        dest = tempdir())
    -}
    -if (FALSE) {
    - ## Download all files
    - piggyback::pb_download(repo = "cboettig/piggyback-tests",
    -                        dest = tempdir())
    -
    -}
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/pb_download_url.html b/docs/reference/pb_download_url.html deleted file mode 100644 index 756c4ee..0000000 --- a/docs/reference/pb_download_url.html +++ /dev/null @@ -1,133 +0,0 @@ - -Get the download url of a given file — pb_download_url • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    Returns the URL download for a public file. This can be useful when writing -scripts that may want to download the file directly without introducing any -dependency on piggyback or authentication steps.

    -
    - -
    -
    pb_download_url(
    -  file = NULL,
    -  repo = guess_repo(),
    -  tag = "latest",
    -  .token = get_token()
    -)
    -
    - -
    -

    Arguments

    -
    file
    -

    name or vector of names of files to be downloaded. If NULL, -all assets attached to the release will be downloaded.

    -
    repo
    -

    Repository name in format "owner/repo". Will guess the current -repo if not specified.

    -
    tag
    -

    tag for the GitHub release to which this data should be attached.

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    -
    -

    Value

    -

    the URL to download a file

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    -
    -pb_download_url("iris.tsv.xz",
    -                repo = "cboettig/piggyback-tests",
    -                tag = "v0.0.1")
    -
    -}
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/pb_list.html b/docs/reference/pb_list.html deleted file mode 100644 index 27d8ef4..0000000 --- a/docs/reference/pb_list.html +++ /dev/null @@ -1,133 +0,0 @@ - -List all assets attached to a release — pb_list • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    List all assets attached to a release

    -
    - -
    -
    pb_list(
    -  repo = guess_repo(),
    -  tag = NULL,
    -  ignore = "manifest.json",
    -  .token = get_token()
    -)
    -
    - -
    -

    Arguments

    -
    repo
    -

    Repository name in format "owner/repo". Will guess the current -repo if not specified.

    -
    tag
    -

    which release tag do we want information for? If NULL (default), -will return a table for all available release tags.

    -
    ignore
    -

    a list of files to ignore (if downloading "all" because -file=NULL).

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    -
    -

    Value

    -

    a data.frame of release asset names, (normalized to local paths), release tag, -timestamp, owner, and repo.

    -
    -
    -

    Details

    -

    To preserve path information, local path delimiters are converted to .2f -when files are uploaded as assets. Listing will display the local filename, -with asset names converting the .2f escape code back to the system delimiter.

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    -pb_list("cboettig/piggyback-tests")
    -}
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/pb_new_release.html b/docs/reference/pb_new_release.html deleted file mode 100644 index 5b0a8df..0000000 --- a/docs/reference/pb_new_release.html +++ /dev/null @@ -1,139 +0,0 @@ - -Create a new release on GitHub repo — pb_new_release • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    Create a new release on GitHub repo

    -
    - -
    -
    pb_new_release(
    -  repo = guess_repo(),
    -  tag,
    -  commit = NULL,
    -  name = tag,
    -  body = "Data release",
    -  draft = FALSE,
    -  prerelease = FALSE,
    -  .token = get_token()
    -)
    -
    - -
    -

    Arguments

    -
    repo
    -

    Repository name in format "owner/repo". Will guess -the current repo if not specified.

    -
    tag
    -

    tag to create for this release

    -
    commit
    -

    Specifies the commit-ish value that -determines where the Git tag is created from. -Can be any branch or commit SHA. Unused if the -git tag already exists. Default: the repository's -default branch (usually master).

    -
    name
    -

    The name of the release. Defaults to tag.

    -
    body
    -

    Text describing the contents of the tag. -default text is "Data release".

    -
    draft
    -

    default FALSE. Set to TRUE to create -a draft (unpublished) release.

    -
    prerelease
    -

    default FALSE. Set to TRUE to -identify the release as a pre-release.

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    -pb_new_release("cboettig/piggyback-tests", "v0.0.5")
    -}
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/pb_pull.html b/docs/reference/pb_pull.html deleted file mode 100644 index b1b0b9d..0000000 --- a/docs/reference/pb_pull.html +++ /dev/null @@ -1,198 +0,0 @@ - - - - - - - - -Pull data from GitHub — pb_pull • piggyback - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    Download any tracked datasets piggybacking on GitHub. Files identical on -local and remote versions will not be transferred. Otherwise, assumes -GitHub version should overwrite local versions.

    - -
    - -
    pb_pull(repo = guess_repo(), tag = "latest", overwrite = TRUE,
    -  manifest = ".manifest.json")
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    repo

    Name of the repo on GitHub (owner/repo, i.e. -cboettig/piggyback). By default will guess the current repository's -GitHub origin.

    tag

    name of release/tag on GitHub to which data assets will be -attached. Default is to use the latest available release.

    overwrite

    should existing files be overwritten when hashes do -not match? default TRUE.

    manifest

    name of the local manifest file. Note: A leading dot -(i.e. indicating a hidden file) in the manifest name will be removed -from the name used on the GitHub asset list.

    - -

    Details

    - -

    Will only download tracked files, as identified by the manifest -attached to the requested release on GitHub. Add files to tracking with -pb_track first and push to GitHub with pb_push.

    - - -

    Examples

    -
    # NOT RUN {
    -pb_pull()
    -# }
    -
    - -
    - -
    - - -
    -

    Site built with pkgdown.

    -
    - -
    -
    - - - - - - diff --git a/docs/reference/pb_push.html b/docs/reference/pb_push.html deleted file mode 100644 index 1fbd3cb..0000000 --- a/docs/reference/pb_push.html +++ /dev/null @@ -1,201 +0,0 @@ - - - - - - - - -Push data to GitHub — pb_push • piggyback - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    Push all currently tracked data files to GitHub. Only files identical -to those already on GitHub (by md5sum hash) will not be transferred. -Otherwise, assumes local version should overwrite existing GitHub -version. Create a new release if you do not want to overwrite previous -GitHub versions when pushing.

    - -
    - -
    pb_push(repo = guess_repo(), tag = "latest", overwrite = TRUE,
    -  manifest = ".manifest.json")
    - -

    Arguments

    - - - - - - - - - - - - - - - - - - -
    repo

    Name of the repo on GitHub (owner/repo, i.e. -cboettig/piggyback). By default will guess the current repository's -GitHub origin.

    tag

    name of release/tag on GitHub to which data assets will be -attached. Default is to use the latest available release.

    overwrite

    should existing files be overwritten when hashes do -not match? default TRUE.

    manifest

    name of the local manifest file. Note: A leading dot -(i.e. indicating a hidden file) in the manifest name will be removed -from the name used on the GitHub asset list.

    - -

    Details

    - -

    Will only upload tracked files, as identified by the local -manifest. Add files to tracking with pb_track first.

    - - -

    Examples

    -
    # NOT RUN {
    -pb_push()
    -# }
    -
    - -
    - -
    - - -
    -

    Site built with pkgdown.

    -
    - -
    -
    - - - - - - diff --git a/docs/reference/pb_track.html b/docs/reference/pb_track.html deleted file mode 100644 index 636ec2e..0000000 --- a/docs/reference/pb_track.html +++ /dev/null @@ -1,200 +0,0 @@ - - - - - - - - -Track data files of a given pattern or location — pb_track • piggyback - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    Track data files of a given pattern or location

    - -
    - -
    pb_track(glob = NULL, repo_root = usethis::proj_get())
    - -

    Arguments

    - - - - - - - - - - -
    glob

    vector of file names and/or glob pattern (e.g. *.csv, data/*.csv) -which will be tracked by piggyback. Omit (default NULL) to just return -a list of files currently tracked.

    repo_root

    repository root, will be guessed by usethis otherwise.

    - -

    Value

    - -

    list of tracked files (invisibly)

    - -

    Details

    - -

    Note: tracked patterns are simply written to .pbattributes (analogous to .gitattributes in git-lfs.) You can also edit this file manually. You will probably want to check .pbattributes in to version control, with git add .pbattributes. Note that tracked file patterns will also be added to .gitignore.

    - - -

    Examples

    -
    # NOT RUN {
    -## Track all .csv and .tsv files
    -pb_track(c("*.tsv", "*.tsv.gz"))
    -
    -# }
    -
    - -
    - -
    - - -
    -

    Site built with pkgdown 1.3.0.9000.

    -
    -
    -
    - - - - - - diff --git a/docs/reference/pb_upload.html b/docs/reference/pb_upload.html deleted file mode 100644 index 659a946..0000000 --- a/docs/reference/pb_upload.html +++ /dev/null @@ -1,142 +0,0 @@ - -Upload data to an existing release — pb_upload • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    NOTE: you must first create a release if one does not already exist.

    -
    - -
    -
    pb_upload(
    -  file,
    -  repo = guess_repo(),
    -  tag = "latest",
    -  name = NULL,
    -  overwrite = "use_timestamps",
    -  use_timestamps = NULL,
    -  show_progress = TRUE,
    -  .token = get_token(),
    -  dir = "."
    -)
    -
    - -
    -

    Arguments

    -
    file
    -

    path to file to be uploaded

    -
    repo
    -

    Repository name in format "owner/repo". Will guess the current -repo if not specified.

    -
    tag
    -

    tag for the GitHub release to which this data should be attached.

    -
    name
    -

    name for uploaded file. If not provided will use the basename of -file (i.e. filename without directory)

    -
    overwrite
    -

    overwrite any existing file with the same name already attached to the release? Default behavior is based on timestamps, only overwriting those files which are older.

    -
    use_timestamps
    -

    DEPRECATED.

    -
    show_progress
    -

    logical, show a progress bar be shown for uploading? -Defaults to TRUE.

    -
    .token
    -

    GitHub authentication token, see [gh::gh_token()]

    -
    dir
    -

    directory relative to which file names should be based.

    -
    - -
    -

    Examples

    -
    if (FALSE) {
    -# Needs your real token to run
    -
    -readr::write_tsv(mtcars,"mtcars.tsv.xz")
    -pb_upload("mtcars.tsv.xz", "cboettig/piggyback-tests")
    -}
    -
    -
    -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/reference/piggyback-package.html b/docs/reference/piggyback-package.html deleted file mode 100644 index f6ea922..0000000 --- a/docs/reference/piggyback-package.html +++ /dev/null @@ -1,129 +0,0 @@ - -piggyback: Managing Larger Data on a GitHub Repository — piggyback-package • piggyback - - -
    -
    - - - -
    -
    - - -
    -

    Because larger (> 50 MB) data files cannot easily be committed to git, -a different approach is required to manage data associated with an analysis in a -GitHub repository. This package provides a simple work-around by allowing larger -(up to 2 GB) data files to piggyback on a repository as assets attached to individual -GitHub releases. These files are not handled by git in any way, but instead are -uploaded, downloaded, or edited directly by calls through the GitHub API. These -data files can be versioned manually by creating different releases. This approach -works equally well with public or private repositories. Data can be uploaded -and downloaded programmatically from scripts. No authentication is required to -download data from public repositories.

    -
    - - -
    -

    Details

    -

    It has two main modes or workflows:

    • pb_upload() / pb_download(): Upload and download individual files to/from -the desired release of the specified repository

    • -
    - -
    -

    Author

    -

    Maintainer: Carl Boettiger cboettig@gmail.com (ORCID) [copyright holder]

    -

    Other contributors:

    • Mark Padgham (ORCID) [contributor]

    • -
    • Jeffrey O Hanson (ORCID) [contributor]

    • -
    • Kevin Kuo (ORCID) [contributor]

    • -
    - -
    - -
    - - -
    - -
    -

    Site built with pkgdown 2.0.1.

    -
    - -
    - - - - - - - - diff --git a/docs/sitemap.xml b/docs/sitemap.xml deleted file mode 100644 index 014000d..0000000 --- a/docs/sitemap.xml +++ /dev/null @@ -1,69 +0,0 @@ - - - - /404.html - - - /articles/alternatives.html - - - /articles/index.html - - - /articles/intro.html - - - /authors.html - - - /CODE_OF_CONDUCT.html - - - /index.html - - - /LICENSE.html - - - /news/index.html - - - /onboarding-submission.html - - - /paper.html - - - /reference/index.html - - - /reference/pb_delete.html - - - /reference/pb_download.html - - - /reference/pb_download_url.html - - - /reference/pb_list.html - - - /reference/pb_new_release.html - - - /reference/pb_pull.html - - - /reference/pb_push.html - - - /reference/pb_track.html - - - /reference/pb_upload.html - - - /reference/piggyback-package.html - - diff --git a/man/guess_read_function.Rd b/man/guess_read_function.Rd new file mode 100644 index 0000000..34480de --- /dev/null +++ b/man/guess_read_function.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_read.R +\name{guess_read_function} +\alias{guess_read_function} +\title{Guess read function from file extension} +\usage{ +guess_read_function(file) +} +\arguments{ +\item{file}{filename to parse} +} +\value{ +function for reading the file, if found +} +\description{ +This function accepts a filename and tries to return a valid function for +reading it. 
+} +\details{ +\code{guess_read_function} understands the following file extensions: +\itemize{ +\item rds with \code{readRDS} +\item csv, csv.gz, csv.xz with \code{utils::read.csv} +\item tsv, tsv.gz, tsv.xz with \code{utils::read.delim} +\item parquet with \code{arrow::read_parquet} +\item txt, txt.gz, txt.xz with \code{readLines} +\item json, json.gz, json.xz with \code{jsonlite::fromJSON} +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_write_function}()}, +\code{\link{pb_read}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} +\keyword{internal} diff --git a/man/guess_write_function.Rd b/man/guess_write_function.Rd new file mode 100644 index 0000000..72ee984 --- /dev/null +++ b/man/guess_write_function.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_write.R +\name{guess_write_function} +\alias{guess_write_function} +\title{Guess write function from file extension} +\usage{ +guess_write_function(file) +} +\arguments{ +\item{file}{filename to parse} +} +\value{ +function for reading the file, if found +} +\description{ +This function accepts a filename and tries to return a valid function for +writing to it. +} +\details{ +\code{guess_write_function} understands the following file extensions: +\itemize{ +\item rds with \code{saveRDS} +\item csv, csv.gz, csv.xz with \code{utils::write.csv} +\item tsv, tsv.gz, tsv.xz with a modified \code{utils::write.csv} where sep is set to \code{"\\t"} +\item parquet with \code{arrow::write_parquet} +\item txt, txt.gz, txt.xz with \code{writeLines} +\item json, json.gz, json.xz with \code{jsonlite::write_json} +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{pb_read}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} +\keyword{internal} diff --git a/man/pb_download_url.Rd b/man/pb_download_url.Rd index 0172211..757d10d 100644 --- a/man/pb_download_url.Rd +++ b/man/pb_download_url.Rd @@ -39,11 +39,34 @@ functions that are able to accept URLs. 
\donttest{ \dontshow{try(\{} -# returns browser url by default -pb_download_url("iris.tsv.xz", repo = "cboettig/piggyback-tests", tag = "v0.0.1") +# returns browser url by default (and all files if none are specified) +browser_url <- pb_download_url( + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) +print(browser_url) +utils::read.csv(browser_url[[1]]) # can return api url if desired -pb_download_url("iris.tsv.xz", repo = "cboettig/piggyback-tests", tag = "v0.0.1", url_type = "api") +api_url <- pb_download_url( + "mtcars.csv", + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) +print(api_url) + +# for public repositories, this will still work +utils::read.csv(api_url) + +# for private repos, can use httr or curl to fetch and then pass into read function +gh_pat <- Sys.getenv("GITHUB_PAT") + +if(!identical(gh_pat, "")){ + resp <- httr::GET(api_url, httr::add_headers(Authorization = paste("Bearer", gh_pat))) + utils::read.csv(text = httr::content(resp, as = "text")) +} + +# or use pb_read which bundles some of this for you \dontshow{\})} } diff --git a/man/pb_read.Rd b/man/pb_read.Rd new file mode 100644 index 0000000..27c3c45 --- /dev/null +++ b/man/pb_read.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_read.R +\name{pb_read} +\alias{pb_read} +\title{Read one file into memory} +\usage{ +pb_read( + file, + ..., + repo = guess_repo(), + tag = "latest", + read_function = guess_read_function(file), + .token = gh::gh_token() +) +} +\arguments{ +\item{file}{string: file name} + +\item{...}{additional arguments passed to \code{read_function} after file} + +\item{repo}{string: GH repository name in format "owner/repo". 
Default +\code{guess_repo()} tries to guess based on current working directory's git repo} + +\item{tag}{string: tag for the GH release, defaults to "latest"} + +\item{read_function}{function: used to read in the data, where the file is +passed as the first argument and any additional arguments are subsequently +passed in via \code{...}. Default \code{guess_read_function(file)} will check the file +extension and try to find an appropriate read function if the extension is one +of rds, csv, tsv, parquet, txt, or json, and will abort if not found.} + +\item{.token}{GitHub authentication token, see \code{\link[gh:gh_token]{gh::gh_token()}}} +} +\value{ +Result of reading in the file in question. +} +\description{ +A convenience wrapper around writing an object to a temporary file and then +uploading to a specified repo/release. This convenience comes at a cost to +performance efficiency, since it first downloads the data to disk and then +reads the data from disk into memory. See \code{vignette("cloud_native")} for +alternative ways to bypass this flow and work with the data directly. 
+} +\examples{ +\donttest{ +try({ # try block is to avoid CRAN issues and is not required in ordinary usage + piggyback::pb_read("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") +}) +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{guess_write_function}()}, +\code{\link{pb_write}()} +} +\concept{pb_rw} diff --git a/man/pb_write.Rd b/man/pb_write.Rd new file mode 100644 index 0000000..05d786a --- /dev/null +++ b/man/pb_write.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pb_write.R +\name{pb_write} +\alias{pb_write} +\title{Write one object to repo/release} +\usage{ +pb_write( + x, + file, + ..., + repo = guess_repo(), + tag = "latest", + write_function = guess_write_function(file), + .token = gh::gh_token() +) +} +\arguments{ +\item{x}{object: memory object to save to piggyback} + +\item{file}{string: file name} + +\item{...}{additional arguments passed to \code{write_function}} + +\item{repo}{string: GH repository name in format "owner/repo". Default +\code{guess_repo()} tries to guess based on current working directory's git repo} + +\item{tag}{string: tag for the GH release, defaults to "latest"} + +\item{write_function}{function: used to write an R object to file, where the +object is passed as the first argument, the filename as the second argument, +and any additional arguments are subsequently passed in via \code{...}. Default +\code{guess_write_function(file)} will check the file extension and try to find an +appropriate write function if the extension is one of rds, csv, tsv, parquet, +txt, or json, and will abort if not found.} + +\item{.token}{GitHub authentication token, see \code{\link[gh:gh_token]{gh::gh_token()}}} +} +\value{ +Writes file to release and returns github API response +} +\description{ +A convenience wrapper around writing an object to a temporary file and then +uploading to a specified repo/release. 
+} +\examples{ +\donttest{ +\dontshow{if (interactive()) \{} + pb_write(mtcars, "mtcars.rds", repo = "tanho63/piggyback-tests") + #> ℹ Uploading to latest release: "v0.0.2". + #> ℹ Uploading mtcars.rds ... + #> |===============================================================| 100\% +\dontshow{\}} +} +} +\seealso{ +Other pb_rw: +\code{\link{guess_read_function}()}, +\code{\link{guess_write_function}()}, +\code{\link{pb_read}()} +} +\concept{pb_rw} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 9edf998..e6ffef8 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -7,6 +7,8 @@ reference: - "pb_download_url" - "pb_upload" - "pb_delete" + - "pb_read" + - "pb_write" - title: Create/Delete Releases desc: Functions for working with releases contents: diff --git a/tests/testthat/test-pb_download.R b/tests/testthat/test-pb_download.R index 7e0aaf4..1051cb5 100644 --- a/tests/testthat/test-pb_download.R +++ b/tests/testthat/test-pb_download.R @@ -126,3 +126,46 @@ test_that("Missing files are reported in download and download_url", { ) }) + +context("pb_read") +test_that("pb_read can read a file from release directly into memory", { + skip_if_offline("api.github.com") + + test_tsv <- pb_read( + file = "iris.tsv.gz", + repo = "cboettig/piggyback-tests", + tag = "v0.0.1", + .token = gh::gh_token() + ) + + expect_equivalent(datasets::iris[[2]], test_tsv[[2]]) +}) + +test_that("pb_read can autodetect different file formats",{ + test_rds <- pb_read( + file = "mtcars.rds", + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) + + expect_equal(nrow(mtcars), nrow(test_rds)) + + skip_if_not_installed("arrow") + test_parquet <- pb_read( + file = "mtcars.parquet", + repo = "tanho63/piggyback-tests", + tag = "v0.0.2" + ) + expect_equal(nrow(mtcars), nrow(test_parquet)) +}) + +test_that("pb_read can accept a custom read_function",{ + skip_if_not_installed("readr") + test_parquet <- pb_read( + file = "mtcars.csv", + repo = "tanho63/piggyback-tests", + tag = 
"v0.0.2", + read_function = readr::read_csv + ) + expect_equal(nrow(mtcars), nrow(test_parquet)) +}) diff --git a/tests/testthat/test-with_auth.R b/tests/testthat/test-with_auth.R index 4e0bfd3..9ad7dc8 100644 --- a/tests/testthat/test-with_auth.R +++ b/tests/testthat/test-with_auth.R @@ -51,6 +51,60 @@ test_that("Error if we try to create an existing release",{ }) +context("pb_write") +test_that("pb_write can write file from memory to release", { + skippy(TRUE) + skip_if_offline("api.github.com") + + out <- pb_write( + x = mtcars, + file = "mtcars.rds", + repo = test_repo, + tag = test_release_tag, + .token = token + ) + expect_type(out,"list") + expect_equal(out[[1]][["status_code"]], 201) +}) + +test_that("pb_write can autodetect different file formats",{ + out <- pb_write( + x = mtcars, + file = "mtcars.csv", + repo = test_repo, + tag = test_release_tag, + .token = token + ) + + expect_type(out,"list") + expect_equal(out[[1]][["status_code"]], 201) + + skip_if_not_installed("arrow") + out <- pb_write( + x = mtcars, + file = "mtcars.parquet", + repo = test_repo, + tag = test_release_tag, + .token = token + ) + expect_type(out,"list") + expect_equal(out[[1]][["status_code"]], 201) +}) + +test_that("pb_write can accept a custom write_function",{ + skip_if_not_installed("readr") + out <- pb_write( + x = mtcars, + file = "mtcars.csv.gz", + repo = test_repo, + tag = test_release_tag, + .token = token, + write_function = readr::write_csv + ) + expect_type(out,"list") + expect_equal(out[[1]][["status_code"]], 201) +}) + context("File upload") test_that("We can upload data", { @@ -186,6 +240,8 @@ context("File delete") test_that("can delete files from release",{ skippy(TRUE) + count_start <- nrow(pb_info(test_repo, test_release_tag)) + withr::with_options(list(piggyback.verbose = TRUE),{ expect_message( pb_delete(file = basename(upload_files)[[1]], @@ -196,7 +252,8 @@ test_that("can delete files from release",{ ) }) - expect_equal(nrow(pb_info(test_repo, 
test_release_tag)), 1) + count_end <- nrow(pb_info(test_repo, test_release_tag)) + expect_equal(count_start - 1, count_end) }) test_that("warn if file to delete is not found",{ @@ -249,11 +306,21 @@ test_that("can download private repo file",{ x <- read.csv(file.path(tempdir(),"iris_example.csv")) - # warning(paste(readLines(file.path(tempdir(),"iris_example.csv")), collapse = "\n")) - expect_equal( nrow(x), 150 ) +}) + +test_that("can read private repo files",{ + skippy(TRUE) + + x <- pb_read( + file = "iris_example.csv", + repo = "tanho63/piggyback-private", + tag = "iris", + .token = Sys.getenv("TAN_GH_TOKEN") + ) + expect_equal(nrow(x), 150) }) diff --git a/vignettes/piggyback.Rmd b/vignettes/piggyback.Rmd index 1e8e05a..75cf459 100644 --- a/vignettes/piggyback.Rmd +++ b/vignettes/piggyback.Rmd @@ -9,14 +9,11 @@ vignette: > %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - results="hide", - eval = Sys.getenv("TAN_GH_TOKEN", FALSE) -) - -Sys.setenv(piggyback_cache_duration=0) +knitr::opts_chunk$set(eval = FALSE) +``` +```{r} +library(piggyback) +library(magrittr) ``` ## Why `piggyback`? @@ -36,149 +33,338 @@ number of files or bandwidth to deliver them. ## Authentication No authentication is required to download data from *public* GitHub repositories -using `piggyback`. Nevertheless, `piggyback` recommends setting a token when -possible to avoid rate limits. To upload data to any repository, or to download -data from *private* repositories, you will need to authenticate first. +using `piggyback`. Nevertheless, we recommends setting a token when possible to +avoid rate limits. To upload data to any repository, or to download data from +*private* repositories, you will need to authenticate first. -`piggyback` uses the same GitHub Personal Access Token (PAT) that devtools, usethis, and -friends use (`gh::gh_token()`). 
The current best practice for managing your GitHub -credentials is detailed in this [usethis vignette](https://usethis.r-lib.org/articles/git-credentials.html). +`piggyback` uses the same GitHub Personal Access Token (PAT) that devtools, +usethis, and friends use (`gh::gh_token()`). The current best practice for +managing your GitHub credentials is detailed in this +[usethis vignette](https://usethis.r-lib.org/articles/git-credentials.html). You can also add the token as an environment variable, which may be useful in -situations where you use piggyback non-interactively (i.e.scheduled/automated scripts). +situations where you use piggyback non-interactively (i.e. automated scripts). Here are the relevant steps: - Create a [GitHub Token](https://github.com/settings/tokens/new?scopes=repo,gist&description=PIGGYBACK_PAT) -- Add the environment variable. You can do this: - - via project-specific Renviron: `usethis::edit_r_environ("project")`. You should - then add the Renviron to your gitignore via `usethis::use_git_ignore(".Renviron")`. - **Avoid committing your GITHUB_PAT to the repository for security reasons!** - - via `Sys.setenv(GITHUB_PAT = "{your token}")` in your console for oneoff usage. - Avoid adding this line to your R scripts -- remember, the goal here is to avoid - writing your private token in any file that might be shared, even privately. - -## Downloading data +- Add the environment variable: + - via project-specific Renviron: + - `usethis::use_git_ignore(".Renviron")` to update your gitignore - this + prevents accidentally committing your token to GitHub + - `usethis::edit_r_environ("project")` to open the Renviron file, and then + add your token, e.g. `GITHUB_PAT=ghp_a1b2c3d4e5f6g7` + - via `Sys.setenv(GITHUB_PAT = "ghp_a1b2c3d4e5f6g7")` in your console for adhoc + usage. Avoid adding this line to your R scripts -- remember, the goal here is + to avoid writing your private token in any file that might be shared, even + privately. 
+ +## Download Files Download a file from a release: -```r -library(piggyback) -pb_download("iris2.tsv.gz", - repo = "cboettig/piggyback-tests", - tag = "v0.0.1", - dest = tempdir()) -``` -``` -ℹ Downloading "iris2.tsv.gz"... - |======================================================| 100% -``` -```r +```{r} +pb_download( + file = "iris2.tsv.gz", + dest = tempdir(), + repo = "cboettig/piggyback-tests", + tag = "v0.0.1" + ) +#> ℹ Downloading "iris2.tsv.gz"... +#> |======================================================| 100% fs::dir_tree(tempdir()) +#> /tmp/RtmpWxJSZj +#> └── iris2.tsv.gz ``` -``` -/tmp/RtmpWxJSZj -└── iris2.tsv.gz -``` - -**Tips:** - -1. Whenever you are working from a location inside a git repository corresponding -to your GitHub repo, you can simply omit the `repo` argument and it will be detected -automatically. -2. Likewise, if you omit the release `tag`, `pb_download` will simply pull data -from most recent release (`latest`). -3. You can omit `tempdir()` if you are using an RStudio Project (`.Rproj` file) -in your repository: download locations will be relative to Project root. -`tempdir()` is used throughout the examples only to meet CRAN policies and is -unlikely to be the choice you actually want here. - -4. Omit the file name to download all assets connected with a given release. -```r -pb_download(repo = "cboettig/piggyback-tests", - tag = "v0.0.1", - dest = tempdir()) -``` -``` -ℹ Downloading "diamonds.tsv.gz"... - |======================================================| 100% -ℹ Downloading "iris.tsv.gz"... - |======================================================| 100% -ℹ Downloading "iris.tsv.xz"... - |======================================================| 100% -``` -```r +Some default behaviors to know about: + +1. The `repo` argument in most piggyback functions will default to detecting the + relevant GitHub repo based on your current working directory's git configs, + so in many cases you can omit the `repo` argument. +2. 
The `tag` argument in most functions defaults to "latest", which typically + refers to the most recently created release of the repository, unless there + is a release specifically named "latest" or if you have marked a different + release as "latest" via the GitHub UI. +3. The `dest` argument defaults to your current working directory (`"."`). We + use `tempdir()` to meet CRAN policies for the purposes of examples. +4. The `file` argument in `pb_download` defaults to NULL, which will download + all files connected to a given release: +```{r} +pb_download( + repo = "cboettig/piggyback-tests", + tag = "v0.0.1", + dest = tempdir() +) +#> ℹ Downloading "diamonds.tsv.gz"... +#> |======================================================| 100% +#> ℹ Downloading "iris.tsv.gz"... +#> |======================================================| 100% +#> ℹ Downloading "iris.tsv.xz"... +#> |======================================================| 100% fs::dir_tree(tempdir()) +#> /tmp/RtmpWxJSZj +#> ├── diamonds.tsv.gz +#> ├── iris.tsv.gz +#> ├── iris.tsv.xz +#> └── iris2.tsv.gz ``` -``` -/tmp/RtmpWxJSZj -├── diamonds.tsv.gz -├── iris.tsv.gz -├── iris.tsv.xz -└── iris2.tsv.gz +5. The `use_timestamps` argument defaults to TRUE - notice that above, + `iris2.tsv.gz` was not downloaded. If `use_timestamps` is TRUE, pb_download() + will compare the local file timestamp against the GitHub file timestamp, and + only download the file if it has changed. + +`pb_download()` also includes arguments to control the progress bar or if any +particular files should not be downloaded. + +### Download URLs + +Sometimes it is preferable to have a URL from which the data can be read in directly. +These URL can then be passed into another R function, which can be more elegant +and performant than having to first download the files locally. 
Enter `pb_download_url()`: + +```{r} +pb_download_url(repo = "cboettig/piggyback-tests", tag = "v0.0.1") +#> [1] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/diamonds.tsv.gz" +#> [2] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris.tsv.gz" +#> [3] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris.tsv.xz" +#> [4] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris2.tsv.gz" ``` -These defaults mean that in most cases, it is sufficient to simply call `pb_download()` -without additional arguments to pull in any data associated with a project on a -GitHub repo that is too large to commit to git directly. +By default, this function returns the same download URL that you would get by +visiting the release page, right-clicking on the file, and copying the link (aka +the "browser_download_url"). This URL is served by GitHub's web servers and not +its API servers, and therefore not as restrictive with rate-limiting. + +However, this URL is not accessible for private repositories, since the auth +tokens are handled by the GitHub API. You can retrieve the API download url for +private repositories by passing in `"api"` to the `url_type` argument: +```{r} +pb_download_url(repo = "cboettig/piggyback-tests", tag = "v0.0.1", url_type = "api") +#> [1] https://api.github.com/repos/cboettig/piggyback-tests/releases/assets/44261315 +#> [2] https://api.github.com/repos/cboettig/piggyback-tests/releases/assets/41841778 +#> [3] https://api.github.com/repos/cboettig/piggyback-tests/releases/assets/18538636 +#> [4] https://api.github.com/repos/cboettig/piggyback-tests/releases/assets/8990141 +``` -Notice that above, `iris2.tsv.gz` was not downloaded. `pb_download()` will skip -downloading of any file that already exists locally, if the timestamp on the local copy is more recent than the timestamp on the GitHub copy. Use the `overwrite` parameter to control this behaviour. 
+`pb_download_url` otherwise shares similar default behaviors with `pb_download` +for the `file`, `repo`, and `tag` arguments. + +## Reading data for R usage + +`piggyback` supports several general patterns for reading data into R, with +increasing degrees of performance/efficiency (and complexity): + +- `pb_download()` files to disk and then reading files with a function that reads +from disk into memory +- `pb_download_url()` a set of URLs and then passing those URLs to a function that +retrieves those URLs directly into memory +- Disk-based workflows which require downloading all files first but then can +perform queries before reading into memory +- Cloud-native workflows which can perform queries directly on the URLs before +reading into memory + +We recommend the latter two approaches in cases where performance and efficiency +matter, and have some vignettes with examples: +- [cloud native workflows](https://docs.ropensci.org/piggyback/articles/cloud_native.html) +- disk native workflows + +### Reading files + +`pb_read()` is a wrapper on the first pattern - it downloads the file to a temp +file, then reads that file into memory, then deletes the temporary file. 
It +works for both public and private repositories, handling authentication under +the hood: + +```{r} +pb_read("mtcars.rds", repo = "tanho63/piggyback-private") +#> # A data.frame: 32 × 11 +#> mpg cyl disp hp drat wt qsec vs am gear carb +#> +#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 +#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 +#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 +#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 +#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 +#> # ℹ 27 more rows +#> # ℹ 1 more variable: carb +pb_read("mtcars.parquet", repo = "tanho63/piggyback-private") +#> # A data.frame: 32 × 11 +#> mpg cyl disp hp drat wt qsec vs am gear carb +#> +#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 +#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 +#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 +#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 +#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 +#> # ℹ 27 more rows +#> # ℹ 1 more variable: carb +``` -`pb_download()` also includes arguments to control the progress bar or if any particular -files should not be downloaded. 
+By default, `pb_read` is programmed to use the following `read_function` for the +corresponding file extensions: + +- ".csv", ".csv.gz", ".csv.xz" are read with `utils::read.csv()` +- ".tsv", ".tsv.gz", ".tsv.xz" are read with `utils::read.delim()` +- ".rds" is read with `readRDS()` +- ".json" is read with `jsonlite::fromJSON()` +- ".parquet" is read with `arrow::read_parquet()` +- ".txt" is read with `readLines()` + +If a file extension is not on this list, `pb_read` will raise an error and ask +you to provide a `read_function` - you can also use this parameter to override +the default `read_function` yourself: + +```{r} +pb_read( + file = "play_by_play_2023.qs", + repo = "nflverse/nflverse-data", + tag = "pbp", + read_function = qs::qread +) +#> # A tibble: 42,251 × 372 +#> play_id game_id old_game_id home_team away_team season_type week posteam +#> +#> 1 1 2023_01_ARI_W… 2023091007 WAS ARI REG 1 NA +#> 2 39 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 3 55 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 4 77 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 5 102 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> # ℹ 42,246 more rows +#> # ℹ 364 more variables: posteam_type , defteam , side_of_field , +#> # yardline_100 , game_date , quarter_seconds_remaining , +#> # half_seconds_remaining , game_seconds_remaining , game_half , +#> # quarter_end , drive , sp , qtr , down , +#> # goal_to_go , time , yrdln , ydstogo , ydsnet , +#> # desc , play_type , yards_gained , shotgun , … +``` -### Download URLs +Any `read_function` can be provided so long as it accepts the filename as the +first argument, and you can pass any additional parameters via `...`: -Sometimes it is preferable to have a URL from which the data can be read in directly, -rather than downloading the data to a local file. For example, such a URL can be -embedded directly into another R script, avoiding any dependence on `piggyback` -(provided the repository is already public.) 
To get a list of URLs rather than -actually downloading the files, use `pb_download_url()`: +```{r} +pb_read( + file = "play_by_play_2023.csv", + n_max = 10, + repo = "nflverse/nflverse-data", + tag = "pbp", + read_function = readr::read_csv +) +#> # A tibble: 10 × 372 +#> play_id game_id old_game_id home_team away_team season_type week posteam +#> +#> 1 1 2023_01_ARI_W… 2023091007 WAS ARI REG 1 NA +#> 2 39 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 3 55 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 4 77 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> 5 102 2023_01_ARI_W… 2023091007 WAS ARI REG 1 WAS +#> # ℹ 5 more rows +#> # ℹ 364 more variables: posteam_type , defteam , side_of_field , +#> # yardline_100 , game_date , quarter_seconds_remaining , +#> # half_seconds_remaining , game_seconds_remaining , game_half , +#> # quarter_end , drive , sp , qtr , down , +#> # goal_to_go , time , yrdln , ydstogo , ydsnet , +#> # desc , play_type , yards_gained , shotgun , … +``` -```r -pb_download_url(repo = "cboettig/piggyback-tests", - tag = "v0.0.1") +### Reading from URLs + +More efficiently, many read functions accept URLs, including `read.csv()`, +`arrow::read_parquet()`, `readr::read_csv()`, `data.table::fread()`, and +`jsonlite::fromJSON()`, so reading in one file can be done by passing along the +output of `pb_download_url()`: + +```{r} +pb_download_url("mtcars.csv", repo = "tanho63/piggyback-tests", tag = "v0.0.2") %>% + read.csv() +#> # A data.frame: 32 × 12 +#> X mpg cyl disp hp drat wt qsec vs am gear +#> +#> 1 Mazda… 21 6 160 110 3.9 2.62 16.5 0 1 4 +#> 2 Mazda… 21 6 160 110 3.9 2.88 17.0 0 1 4 +#> 3 Datsu… 22.8 4 108 93 3.85 2.32 18.6 1 1 4 +#> 4 Horne… 21.4 6 258 110 3.08 3.22 19.4 1 0 3 +#> 5 Horne… 18.7 8 360 175 3.15 3.44 17.0 0 0 3 +#> # ℹ 27 more rows +#> # ℹ 1 more variable: carb +#> # ℹ Use `print(n = ...)` to see more rows ``` + +Some functions also accept URLs when converted into a connection by wrapping it +in `url()`, e.g. 
for `readRDS()`: +```{r} +pb_url <- pb_download_url("mtcars.rds", repo = "tanho63/piggyback-tests", tag = "v0.0.2") %>% + url() +readRDS(pb_url) +#> # A data.frame: 32 × 11 +#> mpg cyl disp hp drat wt qsec vs am gear carb +#> +#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 +#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 +#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 +#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 +#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 +#> # ℹ 27 more rows +#> # ℹ Use `print(n = ...)` to see more rows +close(pb_url) ``` -[1] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/diamonds.tsv.gz" -[2] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris.tsv.gz" -[3] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris.tsv.xz" -[4] "https://github.com/cboettig/piggyback-tests/releases/download/v0.0.1/iris2.tsv.gz" +Note that using `url()` requires that we close the connection after reading it, +or else we will receive warnings about leaving open connections. + +This `url()` approach allows us to pass along authentication for private repos, +e.g. +```{r} +pb_url <- pb_download_url("mtcars.rds", repo = "tanho63/piggyback-private", url_type = "api") %>% + url( + headers = c( + "Accept" = "application/octet-stream", + "Authorization" = paste("Bearer", gh::gh_token()) + ) + ) +readRDS(pb_url) +#> # A tibble: 32 × 11 +#> mpg cyl disp hp drat wt qsec vs am gear carb +#> +#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 +#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 +#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 +#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 +#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 +#> # ℹ 27 more rows +#> # ℹ Use `print(n = ...)` to see more rows +close(pb_url) ``` -## Uploading data +Note that `arrow` does not accept a `url()` connection at this time, so you should +default to `pb_read()` if using private repositories. 
+ -If your GitHub repository doesn't have any -[releases](https://docs.github.com/en/github/administering-a-repository/managing-releases-in-a-repository) -yet, `piggyback` will help you quickly create one. Create new releases to manage -multiple versions of a given data file, or to organize sets of files. +## Uploading data -While you can create releases as often as you like, making a new release is not -necessary each time you upload a file. If maintaining old versions of the data -is not useful, you can stick with a single release and upload all of your data -there. +`piggyback` uploads data to GitHub releases. If your repository doesn't have a +release yet, `piggyback` will prompt you to create one - you can create a release +with: -```r +```{r} pb_release_create(repo = "cboettig/piggyback-tests", tag = "v0.0.2") +#> ✔ Created new release "v0.0.2". ``` -``` -✔ Created new release "v0.0.2". -``` -Once we have at least one release available, we are ready to upload. By default, -`pb_upload` will attach data to the latest release. +Create new releases to manage multiple versions of a given data file, or to +organize sets of files under a common topic. While you can create releases as +often as you like, making a new release is not necessary each time you upload a +file. If maintaining old versions of the data is not useful, you can stick with +a single release and upload all of your data there. + +Once we have at least one release available, we are ready to upload files. By +default, `pb_upload` will attach data to the latest release. -```r +```{r} ## We'll need some example data first. ## Pro tip: compress your tabular data to save space & speed upload/downloads readr::write_tsv(mtcars, "mtcars.tsv.gz") -pb_upload("mtcars.tsv.gz", - repo = "cboettig/piggyback-tests") -``` -``` -ℹ Uploading to latest release: "v0.0.2". -ℹ Uploading mtcars.tsv.gz ... 
- |===================================================| 100% +pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") +#> ℹ Uploading to latest release: "v0.0.2". +#> ℹ Uploading mtcars.tsv.gz ... +#> |===================================================| 100% ``` Like `pb_download()`, `pb_upload()` will overwrite any file of the same name already @@ -186,13 +372,9 @@ attached to the release file by default, unless the timestamp of the previously uploaded version is more recent. You can toggle these settings with the `overwrite` parameter. -### Multiple files - -You can pass in a vector of file paths with something like `list.files()` to the `file` argument of `pb_upload()` in order to upload multiple files. Some common patterns: - -```r +`pb_upload` also accepts a vector of multiple files to upload: +```{r} library(magrittr) - ## upload a folder of data list.files("data") %>% pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1") @@ -200,22 +382,51 @@ list.files("data") %>% ## upload certain file extensions list.files(pattern = c("*.tsv.gz", "*.tif", "*.zip")) %>% pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1") +``` + +### Write R object directly to release + +`pb_write` wraps the above process, essentially allowing you to upload directly +to a release by providing an object, filename, and repo/tag: + +```{r} +pb_write(mtcars, "mtcars.rds", repo = "cboettig/piggyback-tests") +#> ℹ Uploading to latest release: "v0.0.2". +#> ℹ Uploading mtcars.rds ... 
+#> |===================================================| 100% +``` +Similar to `pb_read`, `pb_write` has some pre-programmed `write_functions` for +the following file extensions: +- ".csv", ".csv.gz", ".csv.xz" are written with `utils::write.csv()` +- ".tsv", ".tsv.gz", ".tsv.xz" are written with `utils::write.table(x, filename, sep = '\t')` +- ".rds" is written with `saveRDS()` +- ".json" is written with `jsonlite::write_json()` +- ".parquet" is written with `arrow::write_parquet()` +- ".txt" is written with `writeLines()` + +and you can pass custom functions with the `write_function` parameter: +```{r} +pb_write( + x = mtcars, + file = "mtcars.csv.gz", + repo = "cboettig/piggyback-tests", + write_function = data.table::fwrite +) +#> ℹ Uploading to latest release: "v0.0.2". +#> ℹ Uploading mtcars.csv.gz ... +#> |===================================================| 100% ``` -Similarly, you can download all current data assets of the latest or specified -release by using `pb_download()` with no arguments. ## Deleting Files Delete a file from a release: -```r +```{r} pb_delete(file = "mtcars.tsv.gz", repo = "cboettig/piggyback-tests", tag = "v0.0.1") -``` -``` -ℹ Deleted "mtcars.tsv.gz" from "v0.0.1" release on "cboettig/piggyback-tests" +#> ℹ Deleted "mtcars.tsv.gz" from "v0.0.1" release on "cboettig/piggyback-tests" ``` Note that this is irreversible unless you have a copy of the data elsewhere. @@ -224,16 +435,13 @@ Note that this is irreversible unless you have a copy of the data elsewhere. ## List files List all files currently piggybacking on a given release. Omit `tag` to see files on all releases. 
-```r -pb_list(repo = "cboettig/piggyback-tests", - tag = "v0.0.1") -``` -``` - file_name size timestamp tag owner repo -1 diamonds.tsv.gz 571664 2021-09-07 23:38:31 v0.0.1 cboettig piggyback-tests -2 iris.tsv.gz 846 2021-08-05 20:00:09 v0.0.1 cboettig piggyback-tests -3 iris.tsv.xz 848 2020-03-07 06:18:32 v0.0.1 cboettig piggyback-tests -4 iris2.tsv.gz 846 2018-10-05 17:04:33 v0.0.1 cboettig piggyback-tests +```{r} +pb_list(repo = "cboettig/piggyback-tests", tag = "v0.0.1") +#> file_name size timestamp tag owner repo +#> 1 diamonds.tsv.gz 571664 2021-09-07 23:38:31 v0.0.1 cboettig piggyback-tests +#> 2 iris.tsv.gz 846 2021-08-05 20:00:09 v0.0.1 cboettig piggyback-tests +#> 3 iris.tsv.xz 848 2020-03-07 06:18:32 v0.0.1 cboettig piggyback-tests +#> 4 iris2.tsv.gz 846 2018-10-05 17:04:33 v0.0.1 cboettig piggyback-tests ``` ## Caching @@ -242,17 +450,17 @@ To reduce GitHub API calls, piggyback caches `pb_releases` and `pb_list` with a timeout of 10 minutes by default. This avoids repeating identical requests to update its internal record of the repository data (releases, assets, timestamps, etc) during programmatic use. You can increase or decrease this delay by setting the -environment variable in seconds, e.g. `Sys.setenv("piggyback_cache_duration" = 10)` -for a longer delay or `Sys.setenv("piggyback_cache_duration" = 0)` to disable caching, +environment variable in seconds, e.g. `Sys.setenv("piggyback_cache_duration" = 3600)` +for a longer cache or `Sys.setenv("piggyback_cache_duration" = 0)` to disable caching, and then restarting R. ## Valid file names -GitHub assets attached to a release do not support file paths, and will convert -most special characters (`#`, `%`, etc) to `.` or throw an error (e.g. for file -names containing `$`, `@`, `/`). `piggyback` will default to using the base name of -the file only (i.e. 
will only use `"mtcars.csv"` if provided a file path like -`"data/mtcars.csv"`) +GitHub assets attached to a release do not support file paths, and will sometimes +convert special characters (`#`, `%`, etc) to `.` or throw an error (e.g. +for file names containing `$`, `@`, `/`). `piggyback` will default to using the +`basename()` of the file only (i.e. will only use `"mtcars.csv"` if provided a +file path like `"data/mtcars.csv"`) ## A Note on GitHub Releases vs Data Archiving