diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 42e3077f261..795efbe256d 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -613,9 +613,9 @@ "testthat",">= 2.0.0","base/utils","Suggests",FALSE "testthat",">= 2.0.0","models/biocro","Suggests",FALSE "testthat",">= 2.0.0","modules/benchmark","Suggests",FALSE -"testthat",">= 2.0.0","modules/data.atmosphere","Suggests",FALSE "testthat",">= 3.0.0","models/sibcasa","Suggests",FALSE "testthat",">= 3.0.4","base/qaqc","Suggests",FALSE +"testthat",">= 3.1.7","modules/data.atmosphere","Suggests",FALSE "tibble","*","base/db","Imports",FALSE "tibble","*","models/ed","Imports",FALSE "tibble","*","models/fates","Imports",FALSE diff --git a/modules/data.atmosphere/DESCRIPTION b/modules/data.atmosphere/DESCRIPTION index 32ecfce8a76..fceb2f2d7f1 100644 --- a/modules/data.atmosphere/DESCRIPTION +++ b/modules/data.atmosphere/DESCRIPTION @@ -69,7 +69,7 @@ Suggests: PEcAn.settings, progress, reticulate, - testthat (>= 2.0.0), + testthat (>= 3.1.7), withr Remotes: github::adokter/suntools, diff --git a/modules/data.atmosphere/NEWS.md b/modules/data.atmosphere/NEWS.md index ecd7801d184..13ca4a1b686 100644 --- a/modules/data.atmosphere/NEWS.md +++ b/modules/data.atmosphere/NEWS.md @@ -1,5 +1,8 @@ # PEcAn.data.atmosphere 1.8.0.9000 +## Fixed + +* `download.AmerifluxLBL` no longer wrongly re-fetches raw zipfiles when `overwrite = FALSE` # PEcAn.data.atmosphere 1.8.0 diff --git a/modules/data.atmosphere/R/download.AmerifluxLBL.R b/modules/data.atmosphere/R/download.AmerifluxLBL.R index 677266ffa8a..8a2ff965f3f 100644 --- a/modules/data.atmosphere/R/download.AmerifluxLBL.R +++ b/modules/data.atmosphere/R/download.AmerifluxLBL.R @@ -4,11 +4,11 @@ ##' to download a zip-file of data. The zip-file is extracted to a csv-file that is stored ##' to the given outfolder. Details about amf_download_base function can be found here: ##' https://github.com/chuhousen/amerifluxr/blob/master/R/amf_download_base.R -##' +##' ##' Uses Ameriflux LBL JSON API to download met data from Ameriflux towers in CSV format -##' +##' ##' @export -##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix. +##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix. ##' The 'SITE_ID' field in \href{http://ameriflux.lbl.gov/sites/site-list-and-pages/}{list of Ameriflux sites} ##' @param outfolder location on disk where outputs will be stored ##' @param start_date the start date of the data to be downloaded. Format is YYYY-MM-DD (will only use the year part of the date) @@ -26,115 +26,118 @@ ##' \dontrun{ ##' result <- download.AmerifluxLBL("US-Akn","~/","2011-01-01","2011-12-31",overwrite=TRUE) ##' } -##' +##' ##' @author Ankur Desai, Henri Kajasilta based on download.Ameriflux.R by Josh Mantooth, Rob Kooper, Shawn Serbin download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date, overwrite = FALSE, verbose = FALSE, username = "pecan", method, useremail = "@", data_product = "BASE-BADM", data_policy = "CCBY4.0", ...) { - + # Initial set-ups for amerifluxr packages # get start/end year code works on whole years only start_date <- as.POSIXlt(start_date, tz = "UTC") end_date <- as.POSIXlt(end_date, tz = "UTC") - + start_year <- lubridate::year(start_date) end_year <- lubridate::year(end_date) - - + + site <- sub(".* \\((.*)\\)", "\\1", sitename) - - - - - + + # make sure output folder exists if (!file.exists(outfolder)) { dir.create(outfolder, showWarnings = FALSE, recursive = TRUE) } - - - - repeat { - tout <- options("timeout") - zip_file <- try(amerifluxr::amf_download_base(user_id = username, - user_email = useremail, - site_id = site, - data_product = data_product, - data_policy = data_policy, - agree_policy = TRUE, - intended_use = "model", - intended_use_text = "PEcAn download", - verbose = verbose, - out_dir = outfolder) - ) - if (!inherits(zip_file, "try-error")){ - break - }else if(tout$timeout > 250 ){ - PEcAn.logger::logger.severe("Download takes too long, check your connection.") - break + + version <- amerifluxr::amf_var_info() + version <- unique(version[version$Site_ID == site,]$BASE_Version) + if (length(version) != 1) { + PEcAn.logger::logger.error("Could not find AmerifluxLBL version info for site", site) + } + expected_filename <- paste0("AMF_", site, "_", data_product, "_", version, ".zip") + expected_fullpath <- file.path(outfolder, expected_filename) + + if (!overwrite && file.exists(expected_fullpath)) { + PEcAn.logger::logger.debug("File '", expected_filename, "' already exists, skipping download") + zip_file <- expected_fullpath + } else { + repeat { + tout <- getOption("timeout") + zip_file <- try( + amerifluxr::amf_download_base( + user_id = username, + user_email = useremail, + site_id = site, + data_product = data_product, + data_policy = data_policy, + agree_policy = TRUE, + intended_use = "model", + intended_use_text = "PEcAn download", + verbose = verbose, + out_dir = outfolder) + ) + if (!inherits(zip_file, "try-error")){ + break + }else if(tout > 250 ){ + PEcAn.logger::logger.severe("Download takes too long, check your connection.") + break + } + PEcAn.logger::logger.info("Added 100 seconds before the download timeouts") + options(timeout = tout + 100) } - PEcAn.logger::logger.info("Added 100 seconds before the download timeouts") - options(timeout = tout$timeout + 100) } - - - - + + # Path to created zip-file - ftplink <- zip_file - if(!grepl(".zip", ftplink)){ - PEcAn.logger::logger.info("Not able to download a zip-file. Check download.AmerifluxLBL inputs") + if(!grepl(".zip", zip_file)){ + PEcAn.logger::logger.error("Not able to download a zip-file. Check download.AmerifluxLBL inputs") } - + # get zip and csv filenames - outfname <- strsplit(ftplink, "/") - outfname <- outfname[[1]][length(outfname[[1]])] - output_zip_file <- file.path(outfolder, outfname) + outfname <- basename(zip_file) + + if (outfname != expected_filename) { + PEcAn.logger::logger.info( + "Downloaded a file named", sQuote(outfname), + "but download.AmerifluxLBL was expecting", sQuote(expected_filename), ". This may be a PEcAn bug.") + } + file_timestep_hh <- "HH" file_timestep_hr <- "HR" file_timestep <- file_timestep_hh - + + # TODO: Are values of data_product other than "BASE-BADM" actually supported? + # If not, this whole block reduces to + # outcsvname <- sub("-BADM(.*).zip", "_HH\\1.csv", outfname) + # And: if other values _are_ supported, are the fixed-length substrings below + # still correct? endname <- strsplit(outfname, "_") endname <- endname[[1]][length(endname[[1]])] - endname <- gsub("\\..*", "", endname) + endname <- gsub("\\..*", "", endname) outcsvname <- paste0(substr(outfname, 1, 15), "_", file_timestep_hh, "_", endname, ".csv") output_csv_file <- file.path(outfolder, outcsvname) outcsvname_hr <- paste0(substr(outfname, 1, 15), "_", file_timestep_hr, "_", endname, ".csv") output_csv_file_hr <- file.path(outfolder, outcsvname_hr) - - download_file_flag <- TRUE + extract_file_flag <- TRUE - if (!overwrite && file.exists(output_zip_file)) { - PEcAn.logger::logger.debug("File '", output_zip_file, "' already exists, skipping download") - download_file_flag <- FALSE - } if (!overwrite && file.exists(output_csv_file)) { PEcAn.logger::logger.debug("File '", output_csv_file, "' already exists, skipping extraction.") - download_file_flag <- FALSE extract_file_flag <- FALSE file_timestep <- "HH" } else { if (!overwrite && file.exists(output_csv_file_hr)) { PEcAn.logger::logger.debug("File '", output_csv_file_hr, "' already exists, skipping extraction.") - download_file_flag <- FALSE extract_file_flag <- FALSE file_timestep <- "HR" outcsvname <- outcsvname_hr output_csv_file <- output_csv_file_hr } } - - if (download_file_flag) { - extract_file_flag <- TRUE - PEcAn.utils::download_file(ftplink, output_zip_file, method) - if (!file.exists(output_zip_file)) { - PEcAn.logger::logger.severe("FTP did not download ", output_zip_file, " from ", ftplink) - } - } + if (extract_file_flag) { - avail_file <- utils::unzip(output_zip_file, list = TRUE) + avail_file <- utils::unzip(zip_file, list = TRUE) if (length(grep("HH", avail_file)) > 0) { file_timestep <- "HH" } else { @@ -143,62 +146,62 @@ download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date, output_csv_file <- output_csv_file_hr outcsvname <- outcsvname_hr } else { - PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", output_zip_file) + PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", zip_file) } } - utils::unzip(output_zip_file, outcsvname, exdir = outfolder) + utils::unzip(zip_file, outcsvname, exdir = outfolder) if (!file.exists(output_csv_file)) { - PEcAn.logger::logger.severe("ZIP file ", output_zip_file, " did not contain CSV file ", outcsvname) + PEcAn.logger::logger.severe("ZIP file ", zip_file, " did not contain CSV file ", outcsvname) } } - + dbfilename <- paste0(substr(outfname, 1, 15), "_", file_timestep, "_", endname) - + # get start and end year of data from file firstline <- system(paste0("head -4 ", output_csv_file), intern = TRUE) firstline <- firstline[4] lastline <- system(paste0("tail -1 ", output_csv_file), intern = TRUE) - - firstdate_st <- paste0(substr(firstline, 1, 4), "-", + + firstdate_st <- paste0(substr(firstline, 1, 4), "-", substr(firstline, 5, 6), "-", substr(firstline, 7, 8), " ", substr(firstline, 9, 10), ":", substr(firstline, 11, 12)) firstdate <- as.POSIXlt(firstdate_st) - lastdate_st <- paste0(substr(lastline, 1, 4), "-", - substr(lastline, 5, 6), "-", - substr(lastline, 7, 8), " ", - substr(lastline, 9, 10), ":", + lastdate_st <- paste0(substr(lastline, 1, 4), "-", + substr(lastline, 5, 6), "-", + substr(lastline, 7, 8), " ", + substr(lastline, 9, 10), ":", substr(lastline, 11, 12)) lastdate <- as.POSIXlt(lastdate_st) - + syear <- lubridate::year(firstdate) eyear <- lubridate::year(lastdate) - + if (start_year > eyear) { PEcAn.logger::logger.severe("Start_Year", start_year, "exceeds end of record ", eyear, " for ", site) } if (end_year < syear) { PEcAn.logger::logger.severe("End_Year", end_year, "precedes start of record ", syear, " for ", site) } - + rows <- 1 - results <- data.frame(file = character(rows), - host = character(rows), - mimetype = character(rows), - formatname = character(rows), - startdate = character(rows), - enddate = character(rows), - dbfile.name = dbfilename, + results <- data.frame(file = character(rows), + host = character(rows), + mimetype = character(rows), + formatname = character(rows), + startdate = character(rows), + enddate = character(rows), + dbfile.name = dbfilename, stringsAsFactors = FALSE) - + results$file[rows] <- output_csv_file results$host[rows] <- PEcAn.remote::fqdn() results$startdate[rows] <- firstdate_st results$enddate[rows] <- lastdate_st results$mimetype[rows] <- "text/csv" results$formatname[rows] <- "AMERIFLUX_BASE_HH" - + # return list of files downloaded return(results) } # download.AmerifluxLBL diff --git a/modules/data.atmosphere/man/download.AmerifluxLBL.Rd b/modules/data.atmosphere/man/download.AmerifluxLBL.Rd index 9b8fa2290ad..813388c04b6 100644 --- a/modules/data.atmosphere/man/download.AmerifluxLBL.Rd +++ b/modules/data.atmosphere/man/download.AmerifluxLBL.Rd @@ -20,7 +20,7 @@ download.AmerifluxLBL( ) } \arguments{ -\item{sitename}{the Ameriflux ID of the site to be downloaded, used as file name prefix. +\item{sitename}{the Ameriflux ID of the site to be downloaded, used as file name prefix. The 'SITE_ID' field in \href{http://ameriflux.lbl.gov/sites/site-list-and-pages/}{list of Ameriflux sites}} \item{outfolder}{location on disk where outputs will be stored} diff --git a/modules/data.atmosphere/tests/testthat/test.download.AmerifluxLBL.R b/modules/data.atmosphere/tests/testthat/test.download.AmerifluxLBL.R new file mode 100644 index 00000000000..4ca10b43afe --- /dev/null +++ b/modules/data.atmosphere/tests/testthat/test.download.AmerifluxLBL.R @@ -0,0 +1,76 @@ + +test_that("download respects overwrite argument", { + outdir <- withr::local_tempdir() + zippath <- file.path(outdir, "AMF_US-Akn_BASE-BADM_6-5.zip") + csvpath <- sub("-BADM(.*).zip", "_HH\\1.csv", zippath) + + # Mock out amerifluxr functions to test our code without network calls + local_mocked_bindings( + amf_download_base = \(...) { + tmp_csv <- basename(csvpath) + withr::with_tempdir({ + writeLines( + c( "# fake file", + "#", + "TIMESTAMP_START,TIMESTAMP_END", + "201101010000,201101010030", + "201112310000,201112310030"), + tmp_csv) + zip(zippath, tmp_csv, flags = "-qr0X")}) + zippath + }, + amf_var_info = \(...) data.frame( + Site_ID = "US-Akn", + BASE_Version = "6-5"), + .package = "amerifluxr" + ) + + # wrapper, just to skip retyping args + dl_akn <- function(...) download.AmerifluxLBL( + site = "US-Akn", + outfolder = outdir, + start_date = "2011-01-01", + end_date = "2011-10-01", + ...) + + # Case 0: new download + expect_false(file.exists(zippath)) + expect_false(file.exists(csvpath)) + dl_akn() + expect_true(file.exists(zippath)) + expect_true(file.exists(csvpath)) + + + # Case 1: reuse existing download + ziptime <- file.mtime(zippath) + csvtime <- file.mtime(csvpath) + expect_log( + dl_akn(overwrite = FALSE), + "skipping download.*skipping extraction") + expect_equal(file.mtime(zippath), ziptime) + expect_equal(file.mtime(csvpath), csvtime) + + # Case 2: overwrite existing download + dl_akn(overwrite = TRUE) + expect_gt(file.mtime(zippath), ziptime) + expect_gt(file.mtime(csvpath), csvtime) + + # Case 3: Freshen csv without clobbering zip + file.remove(csvpath) + ziptime <- file.mtime(zippath) + expect_log(dl_akn(overwrite = FALSE), "skipping download") + expect_true(file.exists(csvpath)) + expect_equal(file.mtime(zippath), ziptime) + + # Case 4: Re-download zip without clobbering CSV + # (Note: I'm not sure this is desirable! For consistency it may be better + # to overwrite the CSV so we know it matches the zip file. + # If you change the behavior, go ahead and update this test to match.) + file.remove(zippath) + csvtime <- file.mtime(csvpath) + expect_log( + dl_akn(overwrite = FALSE), + "skipping extraction") + expect_true(file.exists(zippath)) + expect_equal(file.mtime(csvpath), csvtime) +})