diff --git a/R/check_eventdate.R b/R/check_eventdate.R index 5627a08..3827516 100644 --- a/R/check_eventdate.R +++ b/R/check_eventdate.R @@ -45,8 +45,7 @@ check_eventdate <- function(data) { if (!"eventDate" %in% names(data)) { return(data_frame( level = "error", - message = "Column eventDate missing", - stringsAsFactors = FALSE + message = "Column eventDate missing" )) } @@ -59,8 +58,7 @@ check_eventdate <- function(data) { level = "error", row = rows, field = "eventDate", - message = paste0("eventDate ", data$eventDate[rows], " does not seem to be a valid date"), - stringsAsFactors = FALSE + message = paste0("eventDate ", data$eventDate[rows], " does not seem to be a valid date") )) } diff --git a/R/check_fields.R b/R/check_fields.R index ed4ec3b..4f8f2ff 100644 --- a/R/check_fields.R +++ b/R/check_fields.R @@ -21,8 +21,7 @@ check_fields <- function(data, level = "error") { level = "error", field = fields, row = NA, - message = paste0("Required field ", fields, " is missing"), - stringsAsFactors = FALSE + message = paste0("Required field ", fields, " is missing") )) } @@ -36,8 +35,7 @@ check_fields <- function(data, level = "error") { level = "error", field = field, row = which(rows), - message = paste0("Empty value for required field ", field), - stringsAsFactors = FALSE + message = paste0("Empty value for required field ", field) )) } } @@ -54,8 +52,7 @@ check_fields <- function(data, level = "error") { errors <- bind_rows(errors, data_frame( field = fields, level = "warning", - message = paste0("Recommended field ", fields, " is missing"), - stringsAsFactors = FALSE + message = paste0("Recommended field ", fields, " is missing") )) } @@ -69,8 +66,7 @@ check_fields <- function(data, level = "error") { level = "warning", field = field, row = which(rows), - message = paste0("Empty value for recommended field ", field), - stringsAsFactors = FALSE + message = paste0("Empty value for recommended field ", field) )) } } diff --git a/R/check_outliers.R b/R/check_outliers.R index de2b3ae..d3ad80e 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -10,7 +10,8 @@ #' by in order to determine the range of valid values. Default is \code{6}. #' @param iqr_coef Coefficient to multiply the interquartile range (IQR) by in #' order to determine the range of valid values. Default values is \code{3}. -#' +#' @param topn Number of species for which the QC should be performed. Default +#' is \code{NA} (all species). #' @return Problematic records or an errors report. #' @examples #' \dontrun{ @@ -24,7 +25,7 @@ #' @seealso \code{\link{plot_outliers}} \code{\link{check_outliers_dataset}} #' \code{\link{check_onland}} \code{\link{check_depth}} #' @export -check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef = 3) { +check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef = 3, topn = NA) { errors <- check_lonlat(data, report = TRUE) if (!'scientificNameID' %in% colnames(data)) { errors <- rbind(errors, data_frame(level = 'error', message = 'Column scientificNameID missing')) @@ -43,7 +44,12 @@ check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef stop(paste(errors, collapse = ", ")) } result <- NULL - taxa <- unique(na.omit(aphiaids)) + + taxa <- table(na.omit(aphiaids)) + if(!is.na(topn) & !is.null(topn) & topn > 0 & topn < length(taxa)) { + taxa <- sort(taxa, decreasing = TRUE)[1:topn] + } + taxa <- as.integer(names(taxa)) for(taxon in taxa) { return_values = report # values are practical for creating the report istaxon <- !is.na(aphiaids) & aphiaids == taxon @@ -179,9 +185,9 @@ plot_outliers_environmental <- function(outliers_info, title = '', sample_outlie upper = ifelse(is.null(o$q3) || is.na(o$q3), NA, o$q3) ) databox <- databox[complete.cases(databox),] datapoints <- data_frame(Statistic=c(rep('MAD', n) , rep('IQR', n)), - Ok = c(o$ok_mad, o$ok_iqr), - Value=rep(o$values, 2), stringsAsFactors = TRUE) - datapoints <- datapoints %>% filter(!is.na(Value) & (as.character(Statistic) %in% as.character(databox$Statistic)) & !Ok) + Ok = c(o$ok_mad, o$ok_iqr), + Value=rep(o$values, 2)) + datapoints <- datapoints %>% filter(!is.na(Ok) & !is.na(Value) & (as.character(Statistic) %in% as.character(databox$Statistic)) & !Ok) if(nrow(databox) > 0 & nrow(datapoints) > 0) { if(!is.null(sample_outliers) & !is.na(sample_outliers) & (sample_outliers < nrow(datapoints)) & sample_outliers > 10) { datapoints <- sample_n(datapoints, sample_outliers) @@ -208,15 +214,19 @@ plot_outliers_environmental <- function(outliers_info, title = '', sample_outlie plot_outliers_spatial <- function(outliers_info, title='', sample_okpoints=1000) { o <- outliers_info[['spatial']] - data <- as_data_frame(o$xy) - if (nrow(data[!o$ok_iqr | !o$ok_mad,]) > 0){ + ok_na <- is.na(o$ok_iqr) | is.na(o$ok_mad) + ok_iqr <- o$ok_iqr[!ok_na] + ok_mad <- o$ok_mad[!ok_na] + data <- as_data_frame(o$xy[!ok_na,]) + + if (nrow(data[!ok_iqr | !ok_mad,]) > 0){ okcolor <- '#1b9e77' data[,'color'] <- okcolor data[,'radius'] <- 3.5 - data[!o$ok_iqr & o$ok_mad, 'color'] <- '#d95f02' - data[!o$ok_iqr & !o$ok_mad, 'color'] <- '#e7298a' - data[o$ok_iqr & !o$ok_mad, 'color'] <- '#7570b3' - data <- unique(data) + data[!ok_iqr & ok_mad, 'color'] <- '#d95f02' + data[!ok_iqr & !ok_mad, 'color'] <- '#e7298a' + data[ok_iqr & !ok_mad, 'color'] <- '#7570b3' + data <- unique(data[complete.cases(data),]) if(!is.null(sample_okpoints) & !is.na(sample_okpoints) & sample_okpoints > 10 & nrow(data) > sample_okpoints) { whichok <- which(data$color == okcolor) if(length(whichok) > sample_okpoints) { diff --git a/R/plot_map.R b/R/plot_map.R index 1d82c27..ac4f464 100644 --- a/R/plot_map.R +++ b/R/plot_map.R @@ -8,29 +8,34 @@ #' @export plot_map <- function(data, zoom = FALSE) { check_lonlat(data, FALSE) - world <- borders("world", colour="gray80", fill="gray80") - m <- ggplot() + - world + - geom_point(data = data, aes(x = decimalLongitude, y = decimalLatitude), size = 2, stroke = 1, alpha = 0.3, colour = "#FF368B") + - xlab("longitude") + - ylab("latitude") + m <- NULL + data <- get_xy_clean(data) + if(NROW(data) > 0) { + world <- borders("world", colour="gray80", fill="gray80") + m <- ggplot() + + world + + geom_point(data = data, aes(x = decimalLongitude, y = decimalLatitude), size = 2, stroke = 1, alpha = 0.3, colour = "#FF368B") + + xlab("longitude") + + ylab("latitude") - xrange <- range(data$decimalLongitude, na.rm = TRUE) - yrange <- range(data$decimalLatitude, na.rm = TRUE) + xrange <- range(data$decimalLongitude, na.rm = TRUE) + yrange <- range(data$decimalLatitude, na.rm = TRUE) - if (zoom & all(is.finite(xrange)) & all(is.finite(yrange))) { - margin <- 0.3 - dx <- margin * (xrange[2] - xrange[1]) - dy <- margin * (yrange[2] - yrange[1]) - xrange[1] <- xrange[1] - dx - xrange[2] <- xrange[2] + dx - yrange[1] <- yrange[1] - dy - yrange[2] <- yrange[2] + dy - m <- m + coord_quickmap(xlim = xrange, ylim = yrange) + if (zoom & all(is.finite(xrange)) & all(is.finite(yrange))) { + margin <- 0.3 + dx <- margin * (xrange[2] - xrange[1]) + dy <- margin * (yrange[2] - yrange[1]) + xrange[1] <- xrange[1] - dx + xrange[2] <- xrange[2] + dx + yrange[1] <- yrange[1] - dy + yrange[2] <- yrange[2] + dy + m <- m + coord_quickmap(xlim = xrange, ylim = yrange) + } else { + m <- m + coord_quickmap() + } } else { - m <- m + coord_quickmap() + warning("No valid coordinates found for plotting") } - return(m) } diff --git a/R/report.R b/R/report.R index d81fe0a..3abf512 100644 --- a/R/report.R +++ b/R/report.R @@ -34,13 +34,15 @@ report_summary <- function(qcreport, maxrows) { #' \code{rappdirs::user_cache_dir("obistools")}). #' @param view Logical, show the report in a browser after creation (default #' \code{TRUE}). +#' @param topnspecies Integer, number of species ordered by number of records +#' for which you want to do the outlier analysis #' @return Returns the full path to the generated html report. #' @examples #' \dontrun{ #' report(abra) #' } #' @export -report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE) { +report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE, topnspecies = 20) { reportfile <- system.file("", "report.Rmd", package = "obistools") @@ -51,7 +53,7 @@ report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRU check_onland(data, report = TRUE), check_depth(data, report = TRUE), check_outliers_dataset(data, report = TRUE), - check_outliers_species(data, report = TRUE) + check_outliers_species(data, report = TRUE, topn = topnspecies) ) qc <- distinct(qc) } @@ -63,4 +65,3 @@ report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRU } return(outputfile) } - diff --git a/R/util.R b/R/util.R index 56f5b6a..937b01f 100644 --- a/R/util.R +++ b/R/util.R @@ -39,7 +39,7 @@ check_lonlat <- function(data, report) { } if(length(errors) > 0) { if(report) { - return(data.frame(level = "error", message = errors, stringsAsFactors = FALSE)) + return(data_frame(level = "error", message = errors)) } else { stop(paste(errors, collapse = ", ")) } @@ -47,30 +47,45 @@ check_lonlat <- function(data, report) { return(NULL) } -get_xy_clean_duplicates <- function(data) { + +get_xy_clean <- function(data, returnisclean=FALSE) { check_lonlat(data, report = FALSE) + sp <- data.frame(decimalLongitude = numeric(0), decimalLatitude = numeric(0)) + isclean <- NULL if(NROW(data) > 0) { sp <- data %>% select(decimalLongitude, decimalLatitude) - # Only lookup values for valid coordinates + # Only valid coordinates isclean <- stats::complete.cases(sp) & sapply(sp$decimalLongitude, is.numeric) & sapply(sp$decimalLatitude, is.numeric) & !is.na(sp$decimalLongitude) & !is.na(sp$decimalLatitude) & sp$decimalLongitude >= -180.0 & sp$decimalLongitude <= 180.0 & sp$decimalLatitude >= -90.0 & sp$decimalLatitude <= 90.0 - cleansp <- sp[isclean,,drop=FALSE] + } + cleansp <- sp[isclean,,drop=FALSE] + if(returnisclean) { + return(list(cleansp=cleansp, isclean=isclean)) + } else { + return(cleansp) + } +} + +get_xy_clean_duplicates <- function(data) { + clean <- get_xy_clean(data, returnisclean = TRUE) + if(NROW(clean$cleansp) > 0) { # Only lookup values for unique coordinates - key <- paste(cleansp$decimalLongitude, cleansp$decimalLatitude, sep='\r') + key <- paste(clean$cleansp$decimalLongitude, clean$cleansp$decimalLatitude, sep='\r') notdup <- !duplicated(key) - uniquesp <- cleansp[notdup,] + uniquesp <- clean$cleansp[notdup,] duplicated_lookup <- match(key, key[notdup]) - list(uniquesp=uniquesp, isclean=isclean, duplicated_lookup=duplicated_lookup) + list(uniquesp=uniquesp, isclean=clean$isclean, duplicated_lookup=duplicated_lookup) } else { list(uniquesp = data.frame(decimalLongitude = numeric(0), decimalLatitude = numeric(0)), isclean = NULL, duplicated_lookup = NULL) } } + list_cache <- function() { list.files(rappdirs::user_cache_dir("obistools"), "call_", full.names = TRUE) } @@ -89,7 +104,7 @@ cache_call <- function(key, expr, env = NULL) { } cache_dir <- rappdirs::user_cache_dir("obistools") cachefile <- file.path(cache_dir, paste0("call_", digest::digest(list(key=key, expr=expr)), ".rds")) - if(file.exists(cachefile) && difftime(Sys.time(), file.info(cachefile)[,"mtime"], units = "hours") < 36) { + if(file.exists(cachefile) && difftime(Sys.time(), file.info(cachefile)[,"mtime"], units = "hours") < 10) { return(readRDS(cachefile)) } else { result <- eval(expr, envir = NULL, enclos = env) diff --git a/man/check_outliers_species.Rd b/man/check_outliers_species.Rd index c514bfd..afa59c8 100644 --- a/man/check_outliers_species.Rd +++ b/man/check_outliers_species.Rd @@ -18,6 +18,9 @@ by in order to determine the range of valid values. Default is \code{6}.} \item{iqr_coef}{Coefficient to multiply the interquartile range (IQR) by in order to determine the range of valid values. Default values is \code{3}.} + +\item{topn}{Number of species for which the QC should be performed. Default +is \code{NA} (all species).} } \value{ Problematic records or an errors report. diff --git a/man/report.Rd b/man/report.Rd index 98357d4..09823be 100644 --- a/man/report.Rd +++ b/man/report.Rd @@ -4,7 +4,8 @@ \alias{report} \title{Creates a basic data quality report.} \usage{ -report(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE) +report(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE, + topnspecies = 20) } \arguments{ \item{data}{The data frame.} @@ -19,6 +20,9 @@ data.} \item{view}{Logical, show the report in a browser after creation (default \code{TRUE}).} + +\item{topnspecies}{Integer, number of species ordered by number of records +for which you want to do the outlier analysis} } \value{ Returns the full path to the generated html report.