diff --git a/.gitignore b/.gitignore index 4a03414..bf695ed 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ data/* .DS_Store CRAN_SUBMISSION CRAN_RELEASE +docs/* diff --git a/DESCRIPTION b/DESCRIPTION index edbb42f..8642924 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,10 @@ Package: piggyback Version: 0.1.5.9004 Title: Managing Larger Data on a GitHub Repository -Description: Because larger (> 50 MB) data files cannot easily be committed to git, - a different approach is required to manage data associated with an analysis in a - GitHub repository. This package provides a simple work-around by allowing larger - (up to 2 GB) data files to piggyback on a repository as assets attached to individual - GitHub releases. These files are not handled by git in any way, but instead are - uploaded, downloaded, or edited directly by calls through the GitHub API. These - data files can be versioned manually by creating different releases. This approach - works equally well with public or private repositories. Data can be uploaded - and downloaded programmatically from scripts. No authentication is required to - download data from public repositories. +Description: Helps store files as GitHub release assets, which is a convenient + way for large/binary data files to piggyback onto public and private GitHub + repositories. Includes functions for file downloads, uploads, and managing + releases via the GitHub API. Authors@R: c(person("Carl", "Boettiger", email = "cboettig@gmail.com", role = c("aut", "cre", "cph"), @@ -43,9 +37,8 @@ Imports: memoise, rlang Suggests: + arrow, spelling, - duckdbfs, - duckdb, readr, covr, testthat, diff --git a/NAMESPACE b/NAMESPACE index ddd6b6d..3d2133d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,9 @@ export(pb_download) export(pb_download_url) export(pb_list) export(pb_new_release) +export(pb_read) export(pb_release_create) export(pb_release_delete) export(pb_releases) export(pb_upload) +export(pb_write) diff --git a/NEWS.md b/NEWS.md index 4761470..1958515 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ before trying API download URLs. This should reduce/eliminate effect of API rate limits for pb_download. [#109] * `"latest"` release now aligns with GitHub's "latest" release definition [#113] * `pb_download_url()` now can return choice of "browser" or "api" download URLs [#116] +* Add new functions `pb_read()` and `pb_write()` as convenience wrappers around +pattern of downloading to `tempfile()` and then reading into memory. [#97] # piggyback 0.1.5 diff --git a/R/pb_download_url.R b/R/pb_download_url.R index 79dce69..43a646b 100644 --- a/R/pb_download_url.R +++ b/R/pb_download_url.R @@ -70,7 +70,7 @@ pb_download_url <- function(file = NULL, file <- file[file %in% df$file_name] } - if(length(file) == 0) return(cli::cli_abort("No download URLs to return.")) + if(length(file) == 0) return(cli::cli_abort("No download URLs found")) switch( url_type, diff --git a/R/pb_read.R b/R/pb_read.R new file mode 100644 index 0000000..79e6baf --- /dev/null +++ b/R/pb_read.R @@ -0,0 +1,90 @@ +#' Read one file into memory +#' +#' A convenience wrapper around writing an object to a temporary file and then +#' uploading to a specified repo/release. This convenience comes at a cost to +#' performance efficiency, since it first downloads the data to disk and then +#' reads the data from disk into memory. See `vignette("cloud_native")` for +#' alternative ways to bypass this flow and work with the data directly. 
+#' +#' @param file string: file name +#' @param repo string: GH repository name in format "owner/repo". Default +#' `guess_repo()` tries to guess based on current working directory's git repo +#' @param tag string: tag for the GH release, defaults to "latest" +#' @param read_function function: used to read in the data, where the file is +#' passed as the first argument and any additional arguments are subsequently +#' passed in via `...`. Default `guess_read_function(file)` will check the file +#' extension and try to find an appropriate read function if the extension is one +#' of rds, csv, tsv, parquet, txt, or json, and will abort if not found. +#' @param ... additional arguments passed to `read_function` after file +#' @param .token GitHub authentication token, see [gh::gh_token()] +#' +#' @export +#' @family pb_rw +#' +#' @return Result of reading in the file in question. +#' @examples \donttest{ +#' try({ # try block is to avoid CRAN issues and is not required in ordinary usage +#' piggyback::pb_read("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") +#' }) +#' } +pb_read <- function(file, + ..., + repo = guess_repo(), + tag = "latest", + read_function = guess_read_function(file), + .token = gh::gh_token()) { + stopifnot( + is.character(file) && length(file) == 1, + is.character(repo) && length(repo) == 1, + is.character(tag) && length(tag) == 1, + rlang::is_function(read_function) + ) + + on.exit(unlink(file.path(tempdir(), file))) + + pb_download( + file = file, + dest = tempdir(check = TRUE), + repo = repo, + tag = tag, + overwrite = TRUE, + .token = .token + ) + + read_function(file.path(tempdir(), file), ...) +} + +#' Guess read function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' reading it. +#' +#' `guess_read_function` understands the following file extensions: +#' - rds with `readRDS` +#' - csv, csv.gz, csv.xz with `utils::read.csv` +#' - tsv, tsv.gz, tsv.xz with `utils::read.delim` +#' - parquet with `arrow::read_parquet` +#' - txt, txt.gz, txt.xz with `readLines` +#' - json, json.gz, json.xz with `jsonlite::fromJSON` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for reading the file, if found +#' @keywords internal +guess_read_function <- function(file){ + file_ext <- tools::file_ext(gsub(x = file, pattern = ".gz$|.xz$", replacement = "")) + if (file_ext == "parquet") rlang::check_installed("arrow") + + read_fn <- switch( + file_ext, + "rds" = readRDS, + "csv" = utils::read.csv, + "tsv" = utils::read.delim, + "parquet" = arrow::read_parquet, + "txt" = readLines, + "json" = jsonlite::fromJSON, + cli::cli_abort("File type {.val {file_ext}} is not recognized, please provide a {.arg read_function}") + ) + + return(read_fn) +} diff --git a/R/pb_write.R b/R/pb_write.R new file mode 100644 index 0000000..10f4b1a --- /dev/null +++ b/R/pb_write.R @@ -0,0 +1,84 @@ +#' Write one object to repo/release +#' +#' A convenience wrapper around writing an object to a temporary file and then +#' uploading to a specified repo/release. +#' +#' @param x object: memory object to save to piggyback +#' @param file string: file name +#' @param ... additional arguments passed to `write_function` +#' @param repo string: GH repository name in format "owner/repo". 
Default +#' `guess_repo()` tries to guess based on current working directory's git repo +#' @param tag string: tag for the GH release, defaults to "latest" +#' @param write_function function: used to write an R object to file, where the +#' object is passed as the first argument, the filename as the second argument, +#' and any additional arguments are subsequently passed in via `...`. Default +#' `guess_write_function(file)` will check the file extension and try to find an +#' appropriate write function if the extension is one of rds, csv, tsv, parquet, +#' txt, or json, and will abort if not found. +#' @param .token GitHub authentication token, see [gh::gh_token()] +#' +#' @export +#' @family pb_rw +#' +#' @return Writes file to release and returns github API response +#' @examples \donttest{ +#' \dontshow{if (interactive()) \{} +#' pb_write(mtcars, "mtcars.rds", repo = "tanho63/piggyback-tests") +#' #> ℹ Uploading to latest release: "v0.0.2". +#' #> ℹ Uploading mtcars.rds ... +#' #> |===============================================================| 100% +#' \dontshow{\}} +#'} +pb_write <- function(x, + file, + ..., + repo = guess_repo(), + tag = "latest", + write_function = guess_write_function(file), + .token = gh::gh_token()) { + stopifnot( + is.character(file) && length(file) == 1, + is.character(repo) && length(repo) == 1, + is.character(tag) && length(tag) == 1, + rlang::is_function(write_function) + ) + destfile <- file.path(tempdir(check = TRUE), file) + on.exit(try(unlink(destfile))) + write_function(x, destfile, ...) + pb_upload(destfile, repo = repo, tag = tag, .token = .token) +} + +#' Guess write function from file extension +#' +#' This function accepts a filename and tries to return a valid function for +#' writing to it. +#' +#' `guess_write_function` understands the following file extensions: +#' - rds with `saveRDS` +#' - csv, csv.gz, csv.xz with `utils::write.csv` +#' - tsv, tsv.gz, tsv.xz with a modified `utils::write.csv` where sep is set to `"\t"` +#' - parquet with `arrow::write_parquet` +#' - txt, txt.gz, txt.xz with `writeLines` +#' - json, json.gz, json.xz with `jsonlite::write_json` +#' +#' @family pb_rw +#' @param file filename to parse +#' @return function for reading the file, if found +#' @keywords internal +guess_write_function <- function(file){ + file_ext <- tools::file_ext(gsub(x = file, pattern = ".gz$|.xz$", replacement = "")) + if (file_ext == "parquet") rlang::check_installed("arrow") + + write_fn <- switch( + file_ext, + "rds" = saveRDS, + "csv" = utils::write.csv, + "tsv" = function(x, file, ..., sep = "\t") utils::write.csv(x = x, file = file, sep = sep, ...), + "txt" = writeLines, + "parquet" = arrow::write_parquet, + "json" = jsonlite::write_json, + cli::cli_abort("File type {.val {file_ext}} is not recognized, please provide a {.arg write_function}") + ) + + return(write_fn) +} diff --git a/README.Rmd b/README.Rmd index d9dce35..1660ae9 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,7 +7,7 @@ output: github_document ```{r setup, include = FALSE} knitr::opts_chunk$set( - eval = TRUE, + eval = FALSE, collapse = TRUE, message = FALSE, comment = "#>", @@ -29,7 +29,7 @@ knitr::opts_chunk$set( [![DOI](http://joss.theoj.org/papers/10.21105/joss.00971/status.svg)](https://doi.org/10.21105/joss.00971) -`{piggyback}` provides an R interface for storing files as GitHub release assets, +`piggyback` provides an R interface for storing files as GitHub release assets, which is a convenient way for large/binary data files to _piggyback_ onto public and private 
GitHub repositories. This package includes functions for file downloads, uploads, and managing releases, which then are passed to the GitHub API. @@ -38,12 +38,12 @@ No authentication is required to download data from public repositories. ## Installation Install from CRAN via: -```r +```{r} install.packages("piggyback") ``` You can install the development version from [GitHub](https://github.com/ropensci/piggyback) with either r-universe or with remotes: -```r +```{r} install.packages("piggyback", repos = c('https://ropensci.r-universe.dev', getOption("repos"))) # install.packages("remotes") remotes::install_github("ropensci/piggyback") @@ -53,7 +53,7 @@ See [getting started vignette](https://docs.ropensci.org/piggyback/articles/intr for a more comprehensive introduction. Download data attached to a GitHub release: -```r +```{r} library(piggyback) pb_download("iris2.tsv.gz", repo = "cboettig/piggyback-tests", @@ -72,12 +72,12 @@ or a GITHUB_PAT environment variable - for more information, see the vignette no [authentication](https://docs.ropensci.org/piggyback/articles/piggyback.html#authentication). We can also upload data to a release. Start by creating a release: -```r +```{r} pb_release_create(repo = "cboettig/piggyback-tests", tag = "v0.0.2") #> ✔ Created new release "v0.0.2". ``` then upload to it: -```r +```{r} readr::write_tsv(mtcars, "mtcars.tsv.gz") pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests") #> ℹ Uploading to latest release: "v0.0.2". @@ -117,8 +117,7 @@ Please note that this project is released with a [Contributor Code of Conduct](https://ropensci.org/code-of-conduct/). By participating in this project you agree to abide by its terms. -```{r include=FALSE} -unlink("*.gz") +```{r eval=TRUE, include=FALSE} codemeta::write_codemeta() ``` diff --git a/README.md b/README.md index b7ccec5..38ced78 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ Status](https://badges.ropensci.org/220_status.svg)](https://github.com/ropensci [![DOI](http://joss.theoj.org/papers/10.21105/joss.00971/status.svg)](https://doi.org/10.21105/joss.00971) -`{piggyback}` provides an R interface for storing files as GitHub -release assets, which is a convenient way for large/binary data files to +`piggyback` provides an R interface for storing files as GitHub release +assets, which is a convenient way for large/binary data files to *piggyback* onto public and private GitHub repositories. This package includes functions for file downloads, uploads, and managing releases, which then are passed to the GitHub API. diff --git a/codemeta.json b/codemeta.json index af1aeb7..80b2449 100644 --- a/codemeta.json +++ b/codemeta.json @@ -2,12 +2,12 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "piggyback", - "description": "Because larger (> 50 MB) data files cannot easily be committed to git, a different approach is required to manage data associated with an analysis in a GitHub repository. This package provides a simple work-around by allowing larger (up to 2 GB) data files to piggyback on a repository as assets attached to individual GitHub releases. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. 
No authentication is required to download data from public repositories.", + "description": "Helps store files as GitHub release assets, which is a convenient way for large/binary data files to piggyback onto public and private GitHub repositories. Includes functions for file downloads, uploads, and managing releases via the GitHub API.", "name": "piggyback: Managing Larger Data on a GitHub Repository", "codeRepository": "https://github.com/ropensci/piggyback", "issueTracker": "https://github.com/ropensci/piggyback/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.5.9003", + "version": "0.1.5.9004", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", @@ -74,6 +74,18 @@ } ], "softwareSuggestions": [ + { + "@type": "SoftwareApplication", + "identifier": "arrow", + "name": "arrow", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=arrow" + }, { "@type": "SoftwareApplication", "identifier": "spelling", @@ -282,5 +294,5 @@ }, "SystemRequirements": null }, - "fileSize": "380.757KB" + "fileSize": "397.705KB" } diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 076b8d3..0000000 --- a/docs/404.html +++ /dev/null @@ -1,125 +0,0 @@ - - -
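The new `pb_write()` and `pb_read()` helpers introduced above pair a tempfile write with an upload, and a download with a read. A minimal usage sketch, assuming a repository you control (the "owner/repo" name below is a placeholder) that already has at least one release:

```r
library(piggyback)

# Placeholder repository; substitute one you can push release assets to.
repo <- "owner/repo"

# pb_write(): guesses a writer from the file extension (write.csv for .csv),
# writes mtcars to a tempfile, then uploads that file to the latest release.
pb_write(mtcars, "mtcars.csv", repo = repo)

# pb_read(): downloads the asset to tempdir() and reads it back in with the
# extension-guessed reader (read.csv for .csv).
df <- pb_read("mtcars.csv", repo = repo)
```

Files with extensions the guessers do not recognize can still be handled by passing an explicit `read_function` or `write_function`.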
vignettes/alternatives.Rmd: piggyback vs the alternatives

There are many alternatives to `piggyback`, and after considerable experience I haven't found any that ticked all the boxes for me:
Git LFS provides the closest user experience to what I was going for. It stands out above all other alternatives for providing both the best authentication experience (relying directly on any of the standard `git` authentication mechanisms such as https, ssh keys, app integration), and it provides the most legitimate version control of the data. However, there are many show-stoppers to using Git LFS for me.

GitHub pricing & the resulting problems for GitHub's fork / PR model. Described eloquently here. Basically, despite generous rates and free data options everywhere else, GitHub's LFS storage and bandwidth not only cost a lot, but also make it impossible to have public forks and pull requests for your repository. Technically this is a problem only for GitHub's LFS (since it stems from the pricing rules) and can be avoided by using LFS on GitLab or another platform, as Jim Hester has described. Still, this proved unsuccessful for me, and it still faces the other big issue with `git-lfs`:

Overwrites `git` itself. Git LFS is just too integrated into `git` – it replaces your authentic `git` engine with `git-lfs`, such that the identical `git` command can have different behaviors on a machine with `git-lfs` installed vs just plain `git`. Maybe fine for a professional team that is "all in" on `git-lfs`, but it is a constant source of pitfalls when working with students and moving between machines that only have authentic `git` installed. The difficulties with supporting pull requests etc. are also related to this – in some sense, once you have a `git-lfs` repository, you're really using an entirely new version control system that isn't going to be 100% compatible with the nearly-ubiquitous authentic `git`.
Amazon S3 is perhaps the most universal and most obvious go-to place for online-available public and private data storage. The 5 GB/mo free tier is nice and the pricing is very reasonable and only very incremental after that. It is easily the most industry-standard solution, and still probably the best way to go in many cases. It is probably the most scalable solution for very large data, and the only one with built-in support/integration for larger query services like Apache Spark / `sparklyr`. It falls short of my own use case, though, in the authentication area. I require students to create a GitHub account for my courses and my lab group. I don't like requiring such third-party accounts, but this one is fundamental to our daily use in classroom and in research, and most of them will continue using the service afterwards. I particularly don't like having people create complex accounts that they might not even use much in the class or afterwards, just to deal with some pesky minor issue of some data file that is just a little too big for GitHub.

Amazon's authentication is also much more complex than GitHub's passwords or tokens, as is the process of uploading and downloading data from S3 (though the `aws.s3` R package is a rather nice remedy here, it doesn't conform to the same user API as the `aws-cli` (python) tool, leaving some odd quirks and patterns that don't match standard Linux commands). Together, these make it significantly more difficult to deploy as a quick solution for moving private data around with private repositories.
For scientific research purposes, this would be my ideal solution. Encouraging researchers to submit data to a repository at the time of publication is always a challenge, since doing so inevitably involves time & effort and the immediate benefit to the researcher is relatively minimal. If uploading the data to a repository served an immediate practical purpose of facilitating collaboration, backing up and possibly versioning data, etc, during the research process itself rather than after all is said and done, it would be much more compelling. Several repositories permit sharing of private data, at least up to some threshold, including DataONE and figshare. Unfortunately, at this time, I have found the interfaces and R tooling for these too limited or cumbersome for everyday use.
The `piggyback` approach is partly inspired by the strategy used in the `datastorr` package, which also uploads data to GitHub releases. `datastorr` envisions a rather different workflow around this storage strategy, based on the concept of an R "data package" rather than the Git LFS model. I am not a fan of the "data package" approach in general – I think data should be stored in a platform-agnostic way, not as `.Rdata` files, and I often want to first download my data to disk and read it with dedicated functions, not load it "auto-magically" as a package. This latter issue is particularly important when the data files are larger than what can conveniently fit into working memory, and the data is better accessed as a database (e.g. SQLite for tabular data, postgis for spatial data, etc.).

In terms of practical implementation, `datastorr` also creates a new release every time the data file is updated, rather than letting you overwrite files. In principle `piggyback` will let you version data this way as well; simply create a new release first using `pb_new_release(tag = "v2")` or whatever tag you like. I have not opted for this workflow since, in reality, versioning data with releases this way is technically equivalent to creating a new folder for each new version of the data and storing that – unlike true git commits, release assets such as those `datastorr` creates can be easily deleted or overwritten. I still believe permanent versioned archives like Zenodo should be used for long-term versioned distribution. Meanwhile, for day-to-day use I often want to overwrite data files with their most recent versions. (In my case these 'data' files are most often created from upstream data and/or other possibly-long-running code, and are tracked for convenience. As such they often change as a result of continued work on the upstream processing code. Perhaps this is not the case for many users and more attention should be paid to versioning.)
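As a rough sketch of that release-per-version workflow (the tag and file name here are only illustrative, not part of the original vignette):

```r
library(piggyback)

# Pin a new version of the data to its own release rather than overwriting
# the assets attached to an earlier tag.
pb_new_release("cboettig/piggyback-tests", tag = "v2")
pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests", tag = "v2")

# Later, fetch exactly the copy that was attached to "v2":
pb_download("mtcars.tsv.gz",
            repo = "cboettig/piggyback-tests",
            tag = "v2",
            dest = tempdir())
```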
Another creative solution (hack), at least for some file types, is to break large files into multiple smaller files, and commit those to one or many GitHub repositories. While sharding is sometimes a legitimate strategy, it has many obvious practical disadvantages and limitations.
vignettes/intro.Rmd: Why piggyback?

`piggyback` grew out of the needs of students both in my classroom and in my research group, who frequently need to work with data files somewhat larger than one can conveniently manage by committing directly to GitHub. As we frequently want to share and run code that depends on >50 MB data files on each of our own machines, on continuous integration, and on larger computational servers, data sharing quickly becomes a bottleneck.
GitHub allows repositories to attach files of up to 2 GB each to releases as a way to distribute large files associated with the project source code. There is no limit on the number of files or bandwidth to deliver them.
Install the latest release from CRAN using:

```r
install.packages("piggyback")
```

You can install the development version from GitHub with:

```r
# install.packages("devtools")
devtools::install_github("ropensci/piggyback")
```
No authentication is required to download data from public GitHub repositories using `piggyback`. Nevertheless, `piggyback` recommends setting a token when possible to avoid rate limits. To upload data to any repository, or to download data from private repositories, you will need to authenticate first.

To do so, add your GitHub token to an environment variable, e.g. in a `.Renviron` file in your home directory or project directory (any private place you won't upload); see `usethis::edit_r_environ()`. For one-off use you can also set your token from the R console using:

```r
Sys.setenv(GITHUB_PAT = "xxxxxx")
```

But try to avoid putting `Sys.setenv()` in any R scripts – remember, the goal here is to avoid writing your private token in any file that might be shared, even privately.

For more information, please see the usethis guide to GitHub credentials.
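A quick way to confirm that R can see the token, assuming the usethis and gh packages are installed (this check is illustrative, not part of the original vignette):

```r
# Open your user-level .Renviron to add a line like: GITHUB_PAT=ghp_xxxx
usethis::edit_r_environ()

# After restarting R, check whether a token is picked up
# (FALSE means no token was found).
nzchar(gh::gh_token())
```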
Download the latest version or a specific version of the data:

```r
pb_download("iris2.tsv.gz",
            repo = "cboettig/piggyback-tests",
            tag = "v0.0.1",
            dest = tempdir())
```

Note: whenever you are working from a location inside a git repository corresponding to your GitHub repo, you can simply omit the `repo` argument and it will be detected automatically. Likewise, if you omit the release `tag`, `pb_download()` will simply pull data from the most recent release (`"latest"`). Third, you can omit `tempdir()` if you are using an RStudio Project (`.Rproj` file) in your repository, and then the download location will be relative to the Project root. `tempdir()` is used throughout the examples only to meet CRAN policies and is unlikely to be the choice you actually want here.
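For example, run from inside a clone of the GitHub repo itself, the same download can shrink to a single argument. This relies on the defaults just described and is an illustrative sketch rather than part of the original vignette:

```r
# repo and tag are guessed from the working directory's git remote and the
# latest release; the file lands relative to the current project.
pb_download("iris2.tsv.gz")
```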
Lastly, simply omit the file name to download all assets connected with a given release.
```r
pb_download(repo = "cboettig/piggyback-tests",
            tag = "v0.0.1",
            dest = tempdir())
```
These defaults mean that in most cases, it is sufficient to simply call `pb_download()` without additional arguments to pull in any data associated with a project on a GitHub repo that is too large to commit to git directly.

`pb_download()` will skip the download of any file that already exists locally if the timestamp on the local copy is more recent than the timestamp on the GitHub copy. `pb_download()` also includes arguments to control the timestamp behavior, progress bar, whether existing files should be overwritten, or if any particular files should not be downloaded. See function documentation for details.
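For instance, a sketch of those arguments applied to the same test file used above (the exact combination here is illustrative):

```r
# Re-download even if a local copy exists, ignore timestamps entirely,
# and suppress the progress bar.
pb_download("iris2.tsv.gz",
            repo = "cboettig/piggyback-tests",
            tag = "v0.0.1",
            dest = tempdir(),
            overwrite = TRUE,
            use_timestamps = FALSE,
            show_progress = FALSE)
```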
Sometimes it is preferable to have a URL from which the data can be read in directly, rather than downloading the data to a local file. For example, such a URL can be embedded directly into another R script, avoiding any dependence on `piggyback` (provided the repository is already public). To get a list of URLs rather than actually downloading the files, use `pb_download_url()`:
```r
pb_download_url("data/mtcars.tsv.gz",
                repo = "cboettig/piggyback-tests",
                tag = "v0.0.1")
```
If your GitHub repository doesn't have any releases yet, `piggyback` will help you quickly create one. Create new releases to manage multiple versions of a given data file. While you can create releases as often as you like, making a new release is by no means necessary each time you upload a file. If maintaining old versions of the data is not useful, you can stick with a single release and upload all of your data there.

```r
pb_new_release("cboettig/piggyback-tests", "v0.0.2")
```
Once we have at least one release available, we are ready to upload. By default, pb_upload() will attach data to the latest release.
## We'll need some example data first.
## Pro tip: compress your tabular data to save space & speed upload/downloads
readr::write_tsv(mtcars, "mtcars.tsv.gz")

pb_upload("mtcars.tsv.gz",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1")
Like pb_download(), pb_upload() will by default overwrite any file of the same name already attached to the release, unless the timestamp of the previously uploaded version is more recent. You can toggle these settings with overwrite=FALSE and use_timestamps=FALSE.
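For example, a sketch that refuses to overwrite an asset of the same name that is already attached to the release:

pb_upload("mtcars.tsv.gz",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1",
          overwrite = FALSE)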
List all files currently piggybacking on a given release. Omit the tag to see files on all releases.
pb_list(repo = "cboettig/piggyback-tests",
        tag = "v0.0.1")
Delete a file from a release:

pb_delete(file = "mtcars.tsv.gz",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1")
Note that this is irreversible unless you have a copy of the data elsewhere.
You can pass a vector of file paths, e.g. from list.files(), to the file argument of pb_upload() in order to upload multiple files. Some common patterns:
library(magrittr)

## upload a folder of data
list.files("data") %>%
  pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1")

## upload certain file extensions
list.files(pattern = "\\.tsv\\.gz$|\\.tif$|\\.zip$") %>%
  pb_upload(repo = "cboettig/piggyback-tests", tag = "v0.0.1")
Similarly, you can download all current data assets of the latest or specified release by using pb_download() with no arguments.
To reduce API calls to GitHub, piggyback caches most calls with a timeout of 1 second by default. This avoids repeating identical requests to update its internal record of the repository data (releases, assets, timestamps, etc.) during programmatic use. You can increase or decrease this delay by setting the piggyback_cache_duration environmental variable (in seconds), e.g. Sys.setenv("piggyback_cache_duration" = 10) for a longer delay or Sys.setenv("piggyback_cache_duration" = 0) to disable caching.
GitHub assets attached to a release do not support file paths, and will convert most special characters (#, %, etc.) to . or throw an error (e.g. for file names containing $, @, /). piggyback will default to using the base name of the file only (i.e. will only use "mtcars.csv" if provided a file path like "data/mtcars.csv").
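For example, in the following sketch (the local path "data/mtcars.csv" is illustrative), both calls attach an asset named "mtcars.csv"; the second simply makes the default explicit via the name argument:

pb_upload("data/mtcars.csv",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1")

pb_upload("data/mtcars.csv", name = "mtcars.csv",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1")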
piggyback is not intended as a data archiving solution. Importantly, bear in mind that there is nothing special about multiple “versions” in releases, as far as data assets uploaded by piggyback are concerned. The data files piggyback attaches to a release can be deleted or modified at any time – creating a new release to store data assets is the functional equivalent of just creating new directories v0.1, v0.2 to store your data. (GitHub releases are always pinned to a particular git tag, so the code/git-managed contents associated with the repo are more immutable, but remember our data assets just piggyback on top of the repo.)
Permanent, published data should always be archived in a proper data repository with a DOI, such as zenodo.org. Zenodo can freely archive public research data files up to 50 GB in size, and data is strictly versioned (once released, a DOI always refers to the same version of the data; new releases are given new DOIs). piggyback is meant only to lower the friction of working with data during the research process (e.g. to provide data access to collaborators or continuous integration systems during the research process, including for private repositories).
GitHub documentation at the time of writing endorses the use of attachments to releases as a solution for distributing large files as part of your project. Of course, it will be up to GitHub to decide if this use of release attachments is acceptable in the long term.
Because larger (> 50 MB) data files cannot easily be committed to git, a different approach is required to manage data associated with an analysis in a GitHub repository. This package provides a simple work-around by allowing larger (up to 2 GB per file) data files to piggyback on a repository as assets attached to individual GitHub releases. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. No authentication is required to download data from public repositories.
Install from CRAN via:

install.packages("piggyback")

You can install the development version from GitHub with:

# install.packages("devtools")
devtools::install_github("ropensci/piggyback")
See the piggyback vignette for details on authentication and additional package functionality.
Piggyback can download data attached to a release on any repository:

library(piggyback)
pb_download("iris.tsv.gz", repo = "cboettig/piggyback-tests", dest = tempdir())
#> Warning in pb_download("iris.tsv.gz", repo = "cboettig/piggyback-tests", :
#> file(s) iris.tsv.gz not found in repo cboettig/piggyback-tests
Downloading from private repos or uploading to any repo requires authentication, so be sure to set a GITHUB_TOKEN (or GITHUB_PAT) environmental variable, or include the .token argument. Omit the file name to download all attached objects. Omit the repository name to default to the current repository. See the introductory vignette or function documentation for details.
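For example, a sketch passing the token explicitly rather than relying on the environmental variable being picked up automatically:

pb_download("iris.tsv.gz",
            repo = "cboettig/piggyback-tests",
            dest = tempdir(),
            .token = Sys.getenv("GITHUB_PAT"))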
We can also upload data to any existing release (defaults to latest):
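For example (a sketch mirroring the upload example from the vignette; the file name is illustrative):

readr::write_tsv(mtcars, "mtcars.tsv.gz")
pb_upload("mtcars.tsv.gz", repo = "cboettig/piggyback-tests")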
piggyback acts like a poor soul’s Git LFS. Git LFS is not only expensive, it also breaks GitHub’s collaborative model – basically, if someone wants to submit a PR with a simple edit to your docs, they cannot fork your repository, since that would otherwise count against your Git LFS storage. Unlike Git LFS, piggyback doesn’t take over your standard git client; it just perches comfortably on the shoulders of your existing GitHub API. Data can be versioned by piggyback, but relative to Git LFS the versioning is less strict: uploads can be set as a new version or allowed to overwrite previously uploaded data.
Also see our vignette comparing alternatives.
Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.
NEWS.md

- pb_track(), pb_push(), and pb_pull(), which were removed as of version 0.0.0.9900
- pb_upload() now handles the dir argument to control relative path directories.
- gitcreds and other use.
- pb_upload() when creating a new tag in the process; previously, data would be attached to the previously latest tag instead of the newly created one.
- pb_download() where httr would report a 401 status even after data successfully downloads.
- pb_info() calls, increasing default piggyback_cache_duration to 10 minutes [#46]
- main or without previous releases [#48]
- guess_repo() now infers a remote when there are multiple associated with the repo. The “upstream” (preferred) or “origin” repo is selected if either exists; otherwise the function errors and asks the user to explicitly specify a repo (#31).
- release_info() now works properly when there are no existing releases, which enables the usage of pb_new_release() on repos without a release (#29).
- pb_info() under certain cases which resulted in Error in a[[1]] : subscript out of bounds (#36)
- overwrite behavior in pb_upload() (#25)
- pb_download() (#24, #26)
- utils::askYesNo, which is only available in R >= 3.5.0

Allow large and binary data files to “piggyback” on top of your existing repositories: push and pull large-ish (< 2 GB) data files to and from GitHub repositories as attachments to a GitHub release.
Paste the full DESCRIPTION file inside a code block below:
Package: piggyback
Version: 0.0.0.9000
Title: Managing Larger Data on a GitHub Repository
Description: Because larger (> 50 MB) data files cannot easily be committed to git,
  a different approach is required to manage data associated with an analysis in a
  GitHub repository. This package provides a simple work-around by allowing larger
  (up to 2 GB) data files to piggyback on a repository as assets attached to individual
  GitHub releases. These files are not handled by git in any way, but instead are
  uploaded, downloaded, or edited directly by calls through the GitHub API. These
  data files can be versioned manually by creating different releases. This approach
  works equally well with public or private repositories. Data can be uploaded
  and downloaded programmatically from scripts. No authentication is required to
  download data from public repositories.
Authors@R: person("Carl", "Boettiger",
  email = "cboettig@gmail.com",
  role = c("aut", "cre", "cph"),
  comment = c(ORCID = "0000-0002-1642-628X"))
URL: https://github.com/cboettig/piggyback
BugReports: https://github.com/cboettig/piggyback/issues
License: GPL-3
Encoding: UTF-8
LazyData: true
ByteCompile: true
Imports:
  gh,
  httr,
  jsonlite,
  git2r,
  fs,
  usethis,
  crayon,
  clisymbols
Suggests:
  readr,
  covr,
  testthat,
  datasets,
  knitr,
  rmarkdown
VignetteBuilder: knitr
RoxygenNote: 6.0.1.9000
Roxygen: list(markdown = TRUE)
https://github.com/cboettig/piggyback

reproducibility, because accessing the data being analyzed is essential for reproducible workflows, and yet we have no good solution for doing this in workflows with unpublished data or in private workflows once the data is too large for version control (e.g. files > 50 MB).
The target audience is anyone working with data files on GitHub.
datastorr on ropenscilabs is the closest match, which takes a very different approach (from the user perspective – on the back end both store data on GitHub assets) to essentially the same problem. The Intro vignette discusses at greater length many of the alternative possible strategies and why I feel they have all fallen short of my needs and led me to create this package.
Confirm each of the following by checking the box. This package:
[x] paper.md matching JOSS’s requirements with a high-level description in the package root or in inst/.

Does R CMD check (or devtools::check()) succeed? Paste and describe any errors or warnings: No errors, notes, or warnings.
[x] Does the package conform to rOpenSci packaging guidelines? Please describe any exceptions:
If this is a resubmission following rejection, please explain the change in circumstances:
If possible, please provide recommendations of reviewers - those with experience with similar packages and/or likely users of your package - and their GitHub user names:
Rich FitzJohn, @richfitz, would be great based on his experience in this area and with datastorr. Jenny Bryan, @Jennybc, since this package makes heavy use of usethis and GitHub interactions.
GitHub has become a central component for preserving and sharing software-driven analysis in academic research [@Ram2013]. As scientists adopt this workflow, a desire to manage data associated with the analysis in the same manner soon emerges. While small data can easily be committed to GitHub repositories alongside source code and analysis scripts, files larger than 50 MB cannot. Existing work-arounds introduce significant complexity and break the ease of sharing [@Boettiger2018].
This package provides a simple work-around by allowing larger (up to 2 GB) data files to piggyback on a repository as assets attached to individual GitHub releases. piggyback provides a workflow similar to Git LFS [@GitLFS], in which data files can be tracked by type and pushed and pulled to GitHub with dedicated commands. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API [@API3]. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. No authentication is required to download data from public repositories.
As long as a repository has at least one release, users can upload a set of specified files from the current repository to that release by simply passing the file names to pb_upload(). Specify individual files to download using pb_download(), or use no arguments to download all data files attached to the latest release. Alternatively, users can track files by a given pattern: for instance, pb_track("*.csv") will track all *.csv files in the repository. Then use pb_upload(pb_track()) to upload all currently tracked files. piggyback compares timestamps to avoid unnecessary transfer. The piggyback package looks for the same GITHUB_TOKEN environmental variable for authentication that is used across GitHub APIs. Details are provided in an introductory vignette [@Boettiger2018b].
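As a sketch of the tracking workflow described above (note that pb_track() and the associated push/pull commands were removed in later versions of the package):

## track all tsv files in the repository
pb_track(c("*.tsv", "*.tsv.gz"))

## upload everything currently tracked to the latest release
pb_upload(pb_track())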
All functions

| | |
|---|---|
| pb_delete() | Delete an asset attached to a release |
| pb_download() | Download data from an existing release |
| pb_download_url() | Get the download url of a given file |
| pb_list() | List all assets attached to a release |
| pb_new_release() | Create a new release on GitHub repo |
| pb_upload() | Upload data to an existing release |
| piggyback | piggyback: Managing Larger Data on a GitHub Repository |
pb_delete(): Delete an asset attached to a release

pb_delete(
  file = NULL,
  repo = guess_repo(),
  tag = "latest",
  .token = get_token()
)

Arguments:

file: file(s) to be deleted from the release. If NULL (default when argument is omitted), function will delete all attachments to the release.
repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: tag for the GitHub release to which this data should be attached.
.token: GitHub authentication token, see gh::gh_token().

Value: TRUE (invisibly) if a file is found and deleted. Otherwise, returns NULL (invisibly) if no file matching the name was found.
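A usage sketch (the repo and tag values are illustrative):

pb_delete("mtcars.tsv.gz",
          repo = "cboettig/piggyback-tests",
          tag = "v0.0.1")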
pb_download(): Download data from an existing release

pb_download(
  file = NULL,
  dest = ".",
  repo = guess_repo(),
  tag = "latest",
  overwrite = TRUE,
  ignore = "manifest.json",
  use_timestamps = TRUE,
  show_progress = TRUE,
  .token = get_token()
)

Arguments:

file: name or vector of names of files to be downloaded. If NULL, all assets attached to the release will be downloaded.
dest: name or vector of names of where the file should be downloaded. Can be a directory or a list of filenames the same length as the file vector. Any directories in the path provided must already exist.
repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: tag for the GitHub release to which this data should be attached.
overwrite: Should any local files of the same name be overwritten? Default TRUE.
ignore: a list of files to ignore (if downloading "all" because file = NULL).
use_timestamps: DEPRECATED.
show_progress: logical, should a progress bar be shown for downloading? Defaults to TRUE.
.token: GitHub authentication token, see gh::gh_token().

Examples:

if (FALSE) {
  ## Download a specific file.
  ## (dest can be omitted when run inside an R project)
  piggyback::pb_download("iris.tsv.gz",
                         repo = "cboettig/piggyback-tests",
                         dest = tempdir())
}
if (FALSE) {
  ## Download all files
  piggyback::pb_download(repo = "cboettig/piggyback-tests",
                         dest = tempdir())
}
pb_download_url(): Get the download url of a given file

Returns the download URL for a public file. This can be useful when writing scripts that may want to download the file directly without introducing any dependency on piggyback or authentication steps.

pb_download_url(
  file = NULL,
  repo = guess_repo(),
  tag = "latest",
  .token = get_token()
)

Arguments:

file: name or vector of names of files to be downloaded. If NULL, all assets attached to the release will be downloaded.
repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: tag for the GitHub release to which this data should be attached.
.token: GitHub authentication token, see gh::gh_token().

Value: the URL to download a file.

Examples:

if (FALSE) {
pb_download_url("iris.tsv.xz",
                repo = "cboettig/piggyback-tests",
                tag = "v0.0.1")
}
pb_list(): List all assets attached to a release

pb_list(
  repo = guess_repo(),
  tag = NULL,
  ignore = "manifest.json",
  .token = get_token()
)

Arguments:

repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: which release tag do we want information for? If NULL (default), will return a table for all available release tags.
ignore: a list of files to ignore (if downloading "all" because file = NULL).
.token: GitHub authentication token, see gh::gh_token().

Value: a data.frame of release asset names (normalized to local paths), release tag, timestamp, owner, and repo.

Details: To preserve path information, local path delimiters are converted to .2f when files are uploaded as assets. Listing will display the local filename, with asset names converting the .2f escape code back to the system delimiter.

Examples:

if (FALSE) {
pb_list("cboettig/piggyback-tests")
}
pb_new_release(): Create a new release on GitHub repo

pb_new_release(
  repo = guess_repo(),
  tag,
  commit = NULL,
  name = tag,
  body = "Data release",
  draft = FALSE,
  prerelease = FALSE,
  .token = get_token()
)

Arguments:

repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: tag to create for this release.
commit: Specifies the commit-ish value that determines where the Git tag is created from. Can be any branch or commit SHA. Unused if the git tag already exists. Default: the repository's default branch (usually master).
name: The name of the release. Defaults to tag.
body: Text describing the contents of the tag. Default text is "Data release".
draft: default FALSE. Set to TRUE to create a draft (unpublished) release.
prerelease: default FALSE. Set to TRUE to identify the release as a pre-release.
.token: GitHub authentication token, see gh::gh_token().

Examples:

if (FALSE) {
pb_new_release("cboettig/piggyback-tests", "v0.0.5")
}
pb_pull(): Download any tracked datasets piggybacking on GitHub. Files identical on local and remote versions will not be transferred. Otherwise, assumes the GitHub version should overwrite local versions.

pb_pull(repo = guess_repo(), tag = "latest", overwrite = TRUE,
        manifest = ".manifest.json")

Arguments:

repo: Name of the repo on GitHub (in "owner/repo" format).
tag: name of release/tag on GitHub to which data assets will be attached. Default is to use the latest available release.
overwrite: should existing files be overwritten when hashes do not match? Default TRUE.
manifest: name of the local manifest file. Note: A leading dot (i.e. indicating a hidden file) in the manifest name will be removed from the name used on the GitHub asset list.

Details: Will only download tracked files, as identified by the manifest attached to the requested release on GitHub. Add files to tracking with pb_track first and push to GitHub with pb_push.

Examples:

# NOT RUN {
pb_pull()
# }
pb_push(): Push all currently tracked data files to GitHub. Only files identical to those already on GitHub (by md5sum hash) will not be transferred. Otherwise, assumes the local version should overwrite the existing GitHub version. Create a new release if you do not want to overwrite previous GitHub versions when pushing.

pb_push(repo = guess_repo(), tag = "latest", overwrite = TRUE,
        manifest = ".manifest.json")

Arguments:

repo: Name of the repo on GitHub (in "owner/repo" format).
tag: name of release/tag on GitHub to which data assets will be attached. Default is to use the latest available release.
overwrite: should existing files be overwritten when hashes do not match? Default TRUE.
manifest: name of the local manifest file. Note: A leading dot (i.e. indicating a hidden file) in the manifest name will be removed from the name used on the GitHub asset list.

Details: Will only upload tracked files, as identified by the local manifest. Add files to tracking with pb_track first.

Examples:

# NOT RUN {
pb_push()
# }
pb_track(): Track data files of a given pattern or location

pb_track(glob = NULL, repo_root = usethis::proj_get())

Arguments:

glob: vector of file names and/or glob patterns (e.g. "*.tsv.gz").
repo_root: repository root, will be guessed by usethis::proj_get() if not specified.

Value: list of tracked files (invisibly).

Details: tracked patterns are simply written to .pbattributes (analogous to .gitattributes in git-lfs). You can also edit this file manually. You will probably want to check .pbattributes into version control with git add .pbattributes. Note that tracked file patterns will also be added to .gitignore.

Examples:

# NOT RUN {
## Track all .csv and .tsv files
pb_track(c("*.tsv", "*.tsv.gz"))
# }
pb_upload(): Upload data to an existing release

NOTE: you must first create a release if one does not already exist.

pb_upload(
  file,
  repo = guess_repo(),
  tag = "latest",
  name = NULL,
  overwrite = "use_timestamps",
  use_timestamps = NULL,
  show_progress = TRUE,
  .token = get_token(),
  dir = "."
)

Arguments:

file: path to file to be uploaded.
repo: Repository name in format "owner/repo". Will guess the current repo if not specified.
tag: tag for the GitHub release to which this data should be attached.
name: name for uploaded file. If not provided will use the basename of file (i.e. filename without directory).
overwrite: overwrite any existing file with the same name already attached to the release? Default behavior is based on timestamps, only overwriting those files which are older.
use_timestamps: DEPRECATED.
show_progress: logical, should a progress bar be shown for uploading? Defaults to TRUE.
.token: GitHub authentication token, see gh::gh_token().
dir: directory relative to which file names should be based.

Examples:

if (FALSE) {
# Needs your real token to run

readr::write_tsv(mtcars, "mtcars.tsv.xz")
pb_upload("mtcars.tsv.xz", "cboettig/piggyback-tests")
}
R/piggyback.R, piggyback-package.Rd

Because larger (> 50 MB) data files cannot easily be committed to git, a different approach is required to manage data associated with an analysis in a GitHub repository. This package provides a simple work-around by allowing larger (up to 2 GB) data files to piggyback on a repository as assets attached to individual GitHub releases. These files are not handled by git in any way, but instead are uploaded, downloaded, or edited directly by calls through the GitHub API. These data files can be versioned manually by creating different releases. This approach works equally well with public or private repositories. Data can be uploaded and downloaded programmatically from scripts. No authentication is required to download data from public repositories.

It has two main modes or workflows:

- pb_upload() / pb_download(): Upload and download individual files to/from the desired release of the specified repository.

Useful links: