Skip to content

Commit

Permalink
download.AmerifluxLBL: respect overwrite argument (#3382)
Browse files Browse the repository at this point in the history
* download.AmerifluxLBL: respect `overwrite` argument

* guess I was testing on a site with only one var

* deps
  • Loading branch information
infotroph authored Sep 23, 2024
1 parent fbac460 commit d136f48
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 92 deletions.
2 changes: 1 addition & 1 deletion docker/depends/pecan_package_dependencies.csv
Original file line number Diff line number Diff line change
Expand Up @@ -613,9 +613,9 @@
"testthat",">= 2.0.0","base/utils","Suggests",FALSE
"testthat",">= 2.0.0","models/biocro","Suggests",FALSE
"testthat",">= 2.0.0","modules/benchmark","Suggests",FALSE
"testthat",">= 2.0.0","modules/data.atmosphere","Suggests",FALSE
"testthat",">= 3.0.0","models/sibcasa","Suggests",FALSE
"testthat",">= 3.0.4","base/qaqc","Suggests",FALSE
"testthat",">= 3.1.7","modules/data.atmosphere","Suggests",FALSE
"tibble","*","base/db","Imports",FALSE
"tibble","*","models/ed","Imports",FALSE
"tibble","*","models/fates","Imports",FALSE
Expand Down
2 changes: 1 addition & 1 deletion modules/data.atmosphere/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Suggests:
PEcAn.settings,
progress,
reticulate,
testthat (>= 2.0.0),
testthat (>= 3.1.7),
withr
Remotes:
github::adokter/suntools,
Expand Down
3 changes: 3 additions & 0 deletions modules/data.atmosphere/NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# PEcAn.data.atmosphere 1.8.0.9000

## Fixed

* `download.AmerifluxLBL` no longer wrongly re-fetches raw zipfiles when `overwrite = FALSE`

# PEcAn.data.atmosphere 1.8.0

Expand Down
181 changes: 92 additions & 89 deletions modules/data.atmosphere/R/download.AmerifluxLBL.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
##' to download a zip-file of data. The zip-file is extracted to a csv-file that is stored
##' to the given outfolder. Details about amf_download_base function can be found here:
##' https://github.com/chuhousen/amerifluxr/blob/master/R/amf_download_base.R
##'
##'
##' Uses Ameriflux LBL JSON API to download met data from Ameriflux towers in CSV format
##'
##'
##' @export
##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix.
##' @param sitename the Ameriflux ID of the site to be downloaded, used as file name prefix.
##' The 'SITE_ID' field in \href{http://ameriflux.lbl.gov/sites/site-list-and-pages/}{list of Ameriflux sites}
##' @param outfolder location on disk where outputs will be stored
##' @param start_date the start date of the data to be downloaded. Format is YYYY-MM-DD (will only use the year part of the date)
Expand All @@ -26,115 +26,118 @@
##' \dontrun{
##' result <- download.AmerifluxLBL("US-Akn","~/","2011-01-01","2011-12-31",overwrite=TRUE)
##' }
##'
##'
##' @author Ankur Desai, Henri Kajasilta based on download.Ameriflux.R by Josh Mantooth, Rob Kooper, Shawn Serbin


download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date,
overwrite = FALSE, verbose = FALSE, username = "pecan", method,
useremail = "@", data_product = "BASE-BADM", data_policy = "CCBY4.0", ...) {

# Initial set-ups for amerifluxr packages
# get start/end year code works on whole years only
start_date <- as.POSIXlt(start_date, tz = "UTC")
end_date <- as.POSIXlt(end_date, tz = "UTC")

start_year <- lubridate::year(start_date)
end_year <- lubridate::year(end_date)


site <- sub(".* \\((.*)\\)", "\\1", sitename)







# make sure output folder exists
if (!file.exists(outfolder)) {
dir.create(outfolder, showWarnings = FALSE, recursive = TRUE)
}



repeat {
tout <- options("timeout")
zip_file <- try(amerifluxr::amf_download_base(user_id = username,
user_email = useremail,
site_id = site,
data_product = data_product,
data_policy = data_policy,
agree_policy = TRUE,
intended_use = "model",
intended_use_text = "PEcAn download",
verbose = verbose,
out_dir = outfolder)
)
if (!inherits(zip_file, "try-error")){
break
}else if(tout$timeout > 250 ){
PEcAn.logger::logger.severe("Download takes too long, check your connection.")
break

version <- amerifluxr::amf_var_info()
version <- unique(version[version$Site_ID == site,]$BASE_Version)
if (length(version) != 1) {
PEcAn.logger::logger.error("Could not find AmerifluxLBL version info for site", site)
}
expected_filename <- paste0("AMF_", site, "_", data_product, "_", version, ".zip")
expected_fullpath <- file.path(outfolder, expected_filename)

if (!overwrite && file.exists(expected_fullpath)) {
PEcAn.logger::logger.debug("File '", expected_filename, "' already exists, skipping download")
zip_file <- expected_fullpath
} else {
repeat {
tout <- getOption("timeout")
zip_file <- try(
amerifluxr::amf_download_base(
user_id = username,
user_email = useremail,
site_id = site,
data_product = data_product,
data_policy = data_policy,
agree_policy = TRUE,
intended_use = "model",
intended_use_text = "PEcAn download",
verbose = verbose,
out_dir = outfolder)
)
if (!inherits(zip_file, "try-error")){
break
}else if(tout > 250 ){
PEcAn.logger::logger.severe("Download takes too long, check your connection.")
break
}
PEcAn.logger::logger.info("Added 100 seconds before the download timeouts")
options(timeout = tout + 100)
}
PEcAn.logger::logger.info("Added 100 seconds before the download timeouts")
options(timeout = tout$timeout + 100)
}






# Path to created zip-file
ftplink <- zip_file
if(!grepl(".zip", ftplink)){
PEcAn.logger::logger.info("Not able to download a zip-file. Check download.AmerifluxLBL inputs")
if(!grepl(".zip", zip_file)){
PEcAn.logger::logger.error("Not able to download a zip-file. Check download.AmerifluxLBL inputs")
}

# get zip and csv filenames
outfname <- strsplit(ftplink, "/")
outfname <- outfname[[1]][length(outfname[[1]])]
output_zip_file <- file.path(outfolder, outfname)
outfname <- basename(zip_file)

if (outfname != expected_filename) {
PEcAn.logger::logger.info(
"Downloaded a file named", sQuote(outfname),
"but download.AmerifluxLBL was expecting", sQuote(expected_filename), ". This may be a PEcAn bug.")
}

file_timestep_hh <- "HH"
file_timestep_hr <- "HR"
file_timestep <- file_timestep_hh


# TODO: Are values of data_product other than "BASE-BADM" actually supported?
# If not, this whole block reduces to
# outcsvname <- sub("-BADM(.*).zip", "_HH\\1.csv", outfname)
# And: if other values _are_ supported, are the fixed-length substrings below
# still correct?
endname <- strsplit(outfname, "_")
endname <- endname[[1]][length(endname[[1]])]
endname <- gsub("\\..*", "", endname)
endname <- gsub("\\..*", "", endname)
outcsvname <- paste0(substr(outfname, 1, 15), "_", file_timestep_hh, "_", endname, ".csv")
output_csv_file <- file.path(outfolder, outcsvname)
outcsvname_hr <- paste0(substr(outfname, 1, 15), "_", file_timestep_hr, "_", endname, ".csv")
output_csv_file_hr <- file.path(outfolder, outcsvname_hr)

download_file_flag <- TRUE

extract_file_flag <- TRUE
if (!overwrite && file.exists(output_zip_file)) {
PEcAn.logger::logger.debug("File '", output_zip_file, "' already exists, skipping download")
download_file_flag <- FALSE
}
if (!overwrite && file.exists(output_csv_file)) {
PEcAn.logger::logger.debug("File '", output_csv_file, "' already exists, skipping extraction.")
download_file_flag <- FALSE
extract_file_flag <- FALSE
file_timestep <- "HH"
} else {
if (!overwrite && file.exists(output_csv_file_hr)) {
PEcAn.logger::logger.debug("File '", output_csv_file_hr, "' already exists, skipping extraction.")
download_file_flag <- FALSE
extract_file_flag <- FALSE
file_timestep <- "HR"
outcsvname <- outcsvname_hr
output_csv_file <- output_csv_file_hr
}
}

if (download_file_flag) {
extract_file_flag <- TRUE
PEcAn.utils::download_file(ftplink, output_zip_file, method)
if (!file.exists(output_zip_file)) {
PEcAn.logger::logger.severe("FTP did not download ", output_zip_file, " from ", ftplink)
}
}

if (extract_file_flag) {
avail_file <- utils::unzip(output_zip_file, list = TRUE)
avail_file <- utils::unzip(zip_file, list = TRUE)
if (length(grep("HH", avail_file)) > 0) {
file_timestep <- "HH"
} else {
Expand All @@ -143,62 +146,62 @@ download.AmerifluxLBL <- function(sitename, outfolder, start_date, end_date,
output_csv_file <- output_csv_file_hr
outcsvname <- outcsvname_hr
} else {
PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", output_zip_file)
PEcAn.logger::logger.severe("Half-hourly or Hourly data file was not found in ", zip_file)
}
}
utils::unzip(output_zip_file, outcsvname, exdir = outfolder)
utils::unzip(zip_file, outcsvname, exdir = outfolder)
if (!file.exists(output_csv_file)) {
PEcAn.logger::logger.severe("ZIP file ", output_zip_file, " did not contain CSV file ", outcsvname)
PEcAn.logger::logger.severe("ZIP file ", zip_file, " did not contain CSV file ", outcsvname)
}
}

dbfilename <- paste0(substr(outfname, 1, 15), "_", file_timestep, "_", endname)

# get start and end year of data from file
firstline <- system(paste0("head -4 ", output_csv_file), intern = TRUE)
firstline <- firstline[4]
lastline <- system(paste0("tail -1 ", output_csv_file), intern = TRUE)
firstdate_st <- paste0(substr(firstline, 1, 4), "-",

firstdate_st <- paste0(substr(firstline, 1, 4), "-",
substr(firstline, 5, 6), "-",
substr(firstline, 7, 8), " ",
substr(firstline, 9, 10), ":",
substr(firstline, 11, 12))
firstdate <- as.POSIXlt(firstdate_st)
lastdate_st <- paste0(substr(lastline, 1, 4), "-",
substr(lastline, 5, 6), "-",
substr(lastline, 7, 8), " ",
substr(lastline, 9, 10), ":",
lastdate_st <- paste0(substr(lastline, 1, 4), "-",
substr(lastline, 5, 6), "-",
substr(lastline, 7, 8), " ",
substr(lastline, 9, 10), ":",
substr(lastline, 11, 12))
lastdate <- as.POSIXlt(lastdate_st)

syear <- lubridate::year(firstdate)
eyear <- lubridate::year(lastdate)

if (start_year > eyear) {
PEcAn.logger::logger.severe("Start_Year", start_year, "exceeds end of record ", eyear, " for ", site)
}
if (end_year < syear) {
PEcAn.logger::logger.severe("End_Year", end_year, "precedes start of record ", syear, " for ", site)
}

rows <- 1
results <- data.frame(file = character(rows),
host = character(rows),
mimetype = character(rows),
formatname = character(rows),
startdate = character(rows),
enddate = character(rows),
dbfile.name = dbfilename,
results <- data.frame(file = character(rows),
host = character(rows),
mimetype = character(rows),
formatname = character(rows),
startdate = character(rows),
enddate = character(rows),
dbfile.name = dbfilename,
stringsAsFactors = FALSE)

results$file[rows] <- output_csv_file
results$host[rows] <- PEcAn.remote::fqdn()
results$startdate[rows] <- firstdate_st
results$enddate[rows] <- lastdate_st
results$mimetype[rows] <- "text/csv"
results$formatname[rows] <- "AMERIFLUX_BASE_HH"

# return list of files downloaded
return(results)
} # download.AmerifluxLBL
2 changes: 1 addition & 1 deletion modules/data.atmosphere/man/download.AmerifluxLBL.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

test_that("download respects overwrite argument", {
  # Scratch directory that is deleted automatically when the test exits.
  outdir <- withr::local_tempdir()

  # Paths download.AmerifluxLBL is expected to produce for site US-Akn,
  # BASE version 6-5 (must agree with the mocked amf_var_info below).
  zippath <- file.path(outdir, "AMF_US-Akn_BASE-BADM_6-5.zip")
  csvpath <- sub("-BADM(.*).zip", "_HH\\1.csv", zippath)

  # Mock out amerifluxr functions to test our code without network calls.
  # amf_download_base writes a tiny fake half-hourly CSV, zips it to
  # `zippath`, and returns that path -- mimicking the real downloader's
  # contract as used by download.AmerifluxLBL.
  local_mocked_bindings(
    amf_download_base = \(...) {
      tmp_csv <- basename(csvpath)
      withr::with_tempdir({
        writeLines(
          c( "# fake file",
             "#",
             "TIMESTAMP_START,TIMESTAMP_END",
             "201101010000,201101010030",
             "201112310000,201112310030"),
          tmp_csv)
        zip(zippath, tmp_csv, flags = "-qr0X")})
      zippath
    },
    # Version lookup used to predict the zip filename before downloading.
    amf_var_info = \(...) data.frame(
      Site_ID = "US-Akn",
      BASE_Version = "6-5"),
    .package = "amerifluxr"
  )

  # Wrapper, just to skip retyping args.
  # Use the full argument name `sitename` (the function's actual parameter)
  # rather than relying on partial matching of `site`, which lintr flags
  # and which would break if another argument starting with "site" is added.
  dl_akn <- function(...) download.AmerifluxLBL(
    sitename = "US-Akn",
    outfolder = outdir,
    start_date = "2011-01-01",
    end_date = "2011-10-01",
    ...)

  # Case 0: new download -- both files created from scratch
  expect_false(file.exists(zippath))
  expect_false(file.exists(csvpath))
  dl_akn()
  expect_true(file.exists(zippath))
  expect_true(file.exists(csvpath))


  # Case 1: reuse existing download -- neither file is touched
  ziptime <- file.mtime(zippath)
  csvtime <- file.mtime(csvpath)
  expect_log(
    dl_akn(overwrite = FALSE),
    "skipping download.*skipping extraction")
  expect_equal(file.mtime(zippath), ziptime)
  expect_equal(file.mtime(csvpath), csvtime)

  # Case 2: overwrite existing download -- both files refreshed
  dl_akn(overwrite = TRUE)
  expect_gt(file.mtime(zippath), ziptime)
  expect_gt(file.mtime(csvpath), csvtime)

  # Case 3: Freshen csv without clobbering zip
  file.remove(csvpath)
  ziptime <- file.mtime(zippath)
  expect_log(dl_akn(overwrite = FALSE), "skipping download")
  expect_true(file.exists(csvpath))
  expect_equal(file.mtime(zippath), ziptime)

  # Case 4: Re-download zip without clobbering CSV
  # (Note: I'm not sure this is desirable! For consistency it may be better
  # to overwrite the CSV so we know it matches the zip file.
  # If you change the behavior, go ahead and update this test to match.)
  file.remove(zippath)
  csvtime <- file.mtime(csvpath)
  expect_log(
    dl_akn(overwrite = FALSE),
    "skipping extraction")
  expect_true(file.exists(zippath))
  expect_equal(file.mtime(csvpath), csvtime)
})

0 comments on commit d136f48

Please sign in to comment.