Skip to content

Commit

Permalink
download.AmerifluxLBL: respect overwrite argument (#3382)
Browse files Browse the repository at this point in the history
* download.AmerifluxLBL: respect `overwrite` argument

* guess I was testing on a site with only one var

* deps
  • Loading branch information
infotroph authored Sep 23, 2024
1 parent fbac460 commit d136f48
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 92 deletions.
2 changes: 1 addition & 1 deletion docker/depends/pecan_package_dependencies.csv
Original file line number Diff line number Diff line change
Expand Up @@ -613,9 +613,9 @@
"testthat",">= 2.0.0","base/utils","Suggests",FALSE
"testthat",">= 2.0.0","models/biocro","Suggests",FALSE
"testthat",">= 2.0.0","modules/benchmark","Suggests",FALSE
"testthat",">= 2.0.0","modules/data.atmosphere","Suggests",FALSE
"testthat",">= 3.0.0","models/sibcasa","Suggests",FALSE
"testthat",">= 3.0.4","base/qaqc","Suggests",FALSE
"testthat",">= 3.1.7","modules/data.atmosphere","Suggests",FALSE
"tibble","*","base/db","Imports",FALSE
"tibble","*","models/ed","Imports",FALSE
"tibble","*","models/fates","Imports",FALSE
Expand Down
2 changes: 1 addition & 1 deletion modules/data.atmosphere/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Suggests:
PEcAn.settings,
progress,
reticulate,
testthat (>= 2.0.0),
testthat (>= 3.1.7),
withr
Remotes:
github::adokter/suntools,
Expand Down
3 changes: 3 additions & 0 deletions modules/data.atmosphere/NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# PEcAn.data.atmosphere 1.8.0.9000

## Fixed

* `download.AmerifluxLBL` no longer wrongly re-fetches raw zipfiles when `overwrite = FALSE`

# PEcAn.data.atmosphere 1.8.0

Expand Down
181 changes: 92 additions & 89 deletions modules/data.atmosphere/R/download.AmerifluxLBL.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
##' to download a zip-file of data. The zip-file is extracted to a csv-file that is stored
##' to the given outfolder. Details about amf_download_base function can be found here:
##' https://github.com/chuhousen/amerifluxr/blob/master/R/amf_download_base.R
##'
##'
##' Uses Ameriflux LBL JSON API to download met data from Ameriflux towers in CSV format
##'
##'
##' @export
##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix.
##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix.
##' The 'SITE_ID' field in \href{http://ameriflux.lbl.gov/sites/site-list-and-pages/}{list of Ameriflux sites}
##' @param outfolder location on disk where outputs will be stored
##' @param start_date the start date of the data to be downloaded. Format is YYYY-MM-DD (will only use the year part of the date)
Expand All @@ -26,115 +26,118 @@
##' \dontrun{
##' result <- download.AmerifluxLBL("US-Akn","~/","2011-01-01","2011-12-31",overwrite=TRUE)
##' }
##'
##'
##' @author Ankur Desai, Henri Kajasilta based on download.Ameriflux.R by Josh Mantooth, Rob Kooper, Shawn Serbin


download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date,
overwrite = FALSE, verbose = FALSE, username = "pecan", method,
useremail = "@", data_product = "BASE-BADM", data_policy = "CCBY4.0", ...) {

# Initial set-ups for amerifluxr packages
# get start/end year code works on whole years only
start_date <- as.POSIXlt(start_date, tz = "UTC")
end_date <- as.POSIXlt(end_date, tz = "UTC")

start_year <- lubridate::year(start_date)
end_year <- lubridate::year(end_date)


site <- sub(".* \\((.*)\\)", "\\1", sitename)







# make sure output folder exists
if (!file.exists(outfolder)) {
dir.create(outfolder, showWarnings = FALSE, recursive = TRUE)
}



repeat {
tout <- options("timeout")
zip_file <- try(amerifluxr::amf_download_base(user_id = username,
user_email = useremail,
site_id = site,
data_product = data_product,
data_policy = data_policy,
agree_policy = TRUE,
intended_use = "model",
intended_use_text = "PEcAn download",
verbose = verbose,
out_dir = outfolder)
)
if (!inherits(zip_file, "try-error")){
break
}else if(tout$timeout > 250 ){
PEcAn.logger::logger.severe("Download takes too long, check your connection.")
break

version <- amerifluxr::amf_var_info()
version <- unique(version[version$Site_ID == site,]$BASE_Version)
if (length(version) != 1) {
PEcAn.logger::logger.error("Could not find AmerifluxLBL version info for site", site)
}
expected_filename <- paste0("AMF_", site, "_", data_product, "_", version, ".zip")
expected_fullpath <- file.path(outfolder, expected_filename)

if (!overwrite && file.exists(expected_fullpath)) {
PEcAn.logger::logger.debug("File '", expected_filename, "' already exists, skipping download")
zip_file <- expected_fullpath
} else {
repeat {
tout <- getOption("timeout")
zip_file <- try(
amerifluxr::amf_download_base(
user_id = username,
user_email = useremail,
site_id = site,
data_product = data_product,
data_policy = data_policy,
agree_policy = TRUE,
intended_use = "model",
intended_use_text = "PEcAn download",
verbose = verbose,
out_dir = outfolder)
)
if (!inherits(zip_file, "try-error")){
break
}else if(tout > 250 ){
PEcAn.logger::logger.severe("Download takes too long, check your connection.")
break
}
PEcAn.logger::logger.info("Added 100 seconds before the download timeouts")
options(timeout = tout + 100)
}
PEcAn.logger::logger.info("Added 100 seconds before the download timeouts")
options(timeout = tout$timeout + 100)
}






# Path to created zip-file
ftplink <- zip_file
if(!grepl(".zip", ftplink)){
PEcAn.logger::logger.info("Not able to download a zip-file. Check download.AmerifluxLBL inputs")
if(!grepl(".zip", zip_file)){
PEcAn.logger::logger.error("Not able to download a zip-file. Check download.AmerifluxLBL inputs")
}

# get zip and csv filenames
outfname <- strsplit(ftplink, "/")
outfname <- outfname[[1]][length(outfname[[1]])]
output_zip_file <- file.path(outfolder, outfname)
outfname <- basename(zip_file)

if (outfname != expected_filename) {
PEcAn.logger::logger.info(
"Downloaded a file named", sQuote(outfname),
"but download.AmerifluxLBL was expecting", sQuote(expected_filename), ". This may be a PEcAn bug.")
}

file_timestep_hh <- "HH"
file_timestep_hr <- "HR"
file_timestep <- file_timestep_hh


# TODO: Are values of data_product other than "BASE-BADM" actually supported?
# If not, this whole block reduces to
# outcsvname <- sub("-BADM(.*).zip", "_HH\\1.csv", outfname)
# And: if other values _are_ supported, are the fixed-length substrings below
# still correct?
endname <- strsplit(outfname, "_")
endname <- endname[[1]][length(endname[[1]])]
endname <- gsub("\\..*", "", endname)
endname <- gsub("\\..*", "", endname)
outcsvname <- paste0(substr(outfname, 1, 15), "_", file_timestep_hh, "_", endname, ".csv")
output_csv_file <- file.path(outfolder, outcsvname)
outcsvname_hr <- paste0(substr(outfname, 1, 15), "_", file_timestep_hr, "_", endname, ".csv")
output_csv_file_hr <- file.path(outfolder, outcsvname_hr)

download_file_flag <- TRUE

extract_file_flag <- TRUE
if (!overwrite && file.exists(output_zip_file)) {
PEcAn.logger::logger.debug("File '", output_zip_file, "' already exists, skipping download")
download_file_flag <- FALSE
}
if (!overwrite && file.exists(output_csv_file)) {
PEcAn.logger::logger.debug("File '", output_csv_file, "' already exists, skipping extraction.")
download_file_flag <- FALSE
extract_file_flag <- FALSE
file_timestep <- "HH"
} else {
if (!overwrite && file.exists(output_csv_file_hr)) {
PEcAn.logger::logger.debug("File '", output_csv_file_hr, "' already exists, skipping extraction.")
download_file_flag <- FALSE
extract_file_flag <- FALSE
file_timestep <- "HR"
outcsvname <- outcsvname_hr
output_csv_file <- output_csv_file_hr
}
}

if (download_file_flag) {
extract_file_flag <- TRUE
PEcAn.utils::download_file(ftplink, output_zip_file, method)
if (!file.exists(output_zip_file)) {
PEcAn.logger::logger.severe("FTP did not download ", output_zip_file, " from ", ftplink)
}
}

if (extract_file_flag) {
avail_file <- utils::unzip(output_zip_file, list = TRUE)
avail_file <- utils::unzip(zip_file, list = TRUE)
if (length(grep("HH", avail_file)) > 0) {
file_timestep <- "HH"
} else {
Expand All @@ -143,62 +146,62 @@ download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date,
output_csv_file <- output_csv_file_hr
outcsvname <- outcsvname_hr
} else {
PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", output_zip_file)
PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", zip_file)
}
}
utils::unzip(output_zip_file, outcsvname, exdir = outfolder)
utils::unzip(zip_file, outcsvname, exdir = outfolder)
if (!file.exists(output_csv_file)) {
PEcAn.logger::logger.severe("ZIP file ", output_zip_file, " did not contain CSV file ", outcsvname)
PEcAn.logger::logger.severe("ZIP file ", zip_file, " did not contain CSV file ", outcsvname)
}
}

dbfilename <- paste0(substr(outfname, 1, 15), "_", file_timestep, "_", endname)

# get start and end year of data from file
firstline <- system(paste0("head -4 ", output_csv_file), intern = TRUE)
firstline <- firstline[4]
lastline <- system(paste0("tail -1 ", output_csv_file), intern = TRUE)
firstdate_st <- paste0(substr(firstline, 1, 4), "-",

firstdate_st <- paste0(substr(firstline, 1, 4), "-",
substr(firstline, 5, 6), "-",
substr(firstline, 7, 8), " ",
substr(firstline, 9, 10), ":",
substr(firstline, 11, 12))
firstdate <- as.POSIXlt(firstdate_st)
lastdate_st <- paste0(substr(lastline, 1, 4), "-",
substr(lastline, 5, 6), "-",
substr(lastline, 7, 8), " ",
substr(lastline, 9, 10), ":",
lastdate_st <- paste0(substr(lastline, 1, 4), "-",
substr(lastline, 5, 6), "-",
substr(lastline, 7, 8), " ",
substr(lastline, 9, 10), ":",
substr(lastline, 11, 12))
lastdate <- as.POSIXlt(lastdate_st)

syear <- lubridate::year(firstdate)
eyear <- lubridate::year(lastdate)

if (start_year > eyear) {
PEcAn.logger::logger.severe("Start_Year", start_year, "exceeds end of record ", eyear, " for ", site)
}
if (end_year < syear) {
PEcAn.logger::logger.severe("End_Year", end_year, "precedes start of record ", syear, " for ", site)
}

rows <- 1
results <- data.frame(file = character(rows),
host = character(rows),
mimetype = character(rows),
formatname = character(rows),
startdate = character(rows),
enddate = character(rows),
dbfile.name = dbfilename,
results <- data.frame(file = character(rows),
host = character(rows),
mimetype = character(rows),
formatname = character(rows),
startdate = character(rows),
enddate = character(rows),
dbfile.name = dbfilename,
stringsAsFactors = FALSE)

results$file[rows] <- output_csv_file
results$host[rows] <- PEcAn.remote::fqdn()
results$startdate[rows] <- firstdate_st
results$enddate[rows] <- lastdate_st
results$mimetype[rows] <- "text/csv"
results$formatname[rows] <- "AMERIFLUX_BASE_HH"

# return list of files downloaded
return(results)
} # download.AmerifluxLBL
2 changes: 1 addition & 1 deletion modules/data.atmosphere/man/download.AmerifluxLBL.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

test_that("download respects overwrite argument", {
  # Scratch directory that is deleted automatically when the test exits.
  outdir <- withr::local_tempdir()

  # Paths download.AmerifluxLBL is expected to produce for site US-Akn,
  # BASE version 6-5 (must agree with the mocked amf_var_info below).
  zippath <- file.path(outdir, "AMF_US-Akn_BASE-BADM_6-5.zip")
  csvpath <- sub("-BADM(.*).zip", "_HH\\1.csv", zippath)

  # Mock out amerifluxr functions to test our code without network calls.
  # amf_download_base writes a tiny fake half-hourly CSV, zips it to
  # `zippath`, and returns that path -- mimicking the real downloader's
  # contract as used by download.AmerifluxLBL.
  local_mocked_bindings(
    amf_download_base = \(...) {
      tmp_csv <- basename(csvpath)
      withr::with_tempdir({
        writeLines(
          c( "# fake file",
             "#",
             "TIMESTAMP_START,TIMESTAMP_END",
             "201101010000,201101010030",
             "201112310000,201112310030"),
          tmp_csv)
        zip(zippath, tmp_csv, flags = "-qr0X")})
      zippath
    },
    # Version lookup used to predict the zip filename before downloading.
    amf_var_info = \(...) data.frame(
      Site_ID = "US-Akn",
      BASE_Version = "6-5"),
    .package = "amerifluxr"
  )

  # Wrapper, just to skip retyping args.
  # Use the full argument name `sitename` (the function's actual parameter)
  # rather than relying on partial matching of `site`, which lintr flags
  # and which would break if another argument starting with "site" is added.
  dl_akn <- function(...) download.AmerifluxLBL(
    sitename = "US-Akn",
    outfolder = outdir,
    start_date = "2011-01-01",
    end_date = "2011-10-01",
    ...)

  # Case 0: new download -- both files created from scratch
  expect_false(file.exists(zippath))
  expect_false(file.exists(csvpath))
  dl_akn()
  expect_true(file.exists(zippath))
  expect_true(file.exists(csvpath))


  # Case 1: reuse existing download -- neither file is touched
  ziptime <- file.mtime(zippath)
  csvtime <- file.mtime(csvpath)
  expect_log(
    dl_akn(overwrite = FALSE),
    "skipping download.*skipping extraction")
  expect_equal(file.mtime(zippath), ziptime)
  expect_equal(file.mtime(csvpath), csvtime)

  # Case 2: overwrite existing download -- both files refreshed
  dl_akn(overwrite = TRUE)
  expect_gt(file.mtime(zippath), ziptime)
  expect_gt(file.mtime(csvpath), csvtime)

  # Case 3: Freshen csv without clobbering zip
  file.remove(csvpath)
  ziptime <- file.mtime(zippath)
  expect_log(dl_akn(overwrite = FALSE), "skipping download")
  expect_true(file.exists(csvpath))
  expect_equal(file.mtime(zippath), ziptime)

  # Case 4: Re-download zip without clobbering CSV
  # (Note: I'm not sure this is desirable! For consistency it may be better
  # to overwrite the CSV so we know it matches the zip file.
  # If you change the behavior, go ahead and update this test to match.)
  file.remove(zippath)
  csvtime <- file.mtime(csvpath)
  expect_log(
    dl_akn(overwrite = FALSE),
    "skipping extraction")
  expect_true(file.exists(zippath))
  expect_equal(file.mtime(csvpath), csvtime)
})

0 comments on commit d136f48

Please sign in to comment.