Skip to content

Commit

Permalink
fix report when there are NA, speed up some parts
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelbosch committed Sep 21, 2018
1 parent 67dd513 commit b848a47
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 55 deletions.
6 changes: 2 additions & 4 deletions R/check_eventdate.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ check_eventdate <- function(data) {
if (!"eventDate" %in% names(data)) {
return(data_frame(
level = "error",
message = "Column eventDate missing",
stringsAsFactors = FALSE
message = "Column eventDate missing"
))
}

Expand All @@ -59,8 +58,7 @@ check_eventdate <- function(data) {
level = "error",
row = rows,
field = "eventDate",
message = paste0("eventDate ", data$eventDate[rows], " does not seem to be a valid date"),
stringsAsFactors = FALSE
message = paste0("eventDate ", data$eventDate[rows], " does not seem to be a valid date")
))
}

Expand Down
12 changes: 4 additions & 8 deletions R/check_fields.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ check_fields <- function(data, level = "error") {
level = "error",
field = fields,
row = NA,
message = paste0("Required field ", fields, " is missing"),
stringsAsFactors = FALSE
message = paste0("Required field ", fields, " is missing")
))
}

Expand All @@ -36,8 +35,7 @@ check_fields <- function(data, level = "error") {
level = "error",
field = field,
row = which(rows),
message = paste0("Empty value for required field ", field),
stringsAsFactors = FALSE
message = paste0("Empty value for required field ", field)
))
}
}
Expand All @@ -54,8 +52,7 @@ check_fields <- function(data, level = "error") {
errors <- bind_rows(errors, data_frame(
field = fields,
level = "warning",
message = paste0("Recommended field ", fields, " is missing"),
stringsAsFactors = FALSE
message = paste0("Recommended field ", fields, " is missing")
))
}

Expand All @@ -69,8 +66,7 @@ check_fields <- function(data, level = "error") {
level = "warning",
field = field,
row = which(rows),
message = paste0("Empty value for recommended field ", field),
stringsAsFactors = FALSE
message = paste0("Empty value for recommended field ", field)
))
}
}
Expand Down
34 changes: 22 additions & 12 deletions R/check_outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
#' by in order to determine the range of valid values. Default is \code{6}.
#' @param iqr_coef Coefficient to multiply the interquartile range (IQR) by in
#' order to determine the range of valid values. Default values is \code{3}.
#'
#' @param topn Number of species for which the QC should be performed. Default
#' is \code{NA} (all species).
#' @return Problematic records or an errors report.
#' @examples
#' \dontrun{
Expand All @@ -24,7 +25,7 @@
#' @seealso \code{\link{plot_outliers}} \code{\link{check_outliers_dataset}}
#' \code{\link{check_onland}} \code{\link{check_depth}}
#' @export
check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef = 3) {
check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef = 3, topn = NA) {
errors <- check_lonlat(data, report = TRUE)
if (!'scientificNameID' %in% colnames(data)) {
errors <- rbind(errors, data_frame(level = 'error', message = 'Column scientificNameID missing'))
Expand All @@ -43,7 +44,12 @@ check_outliers_species <- function(data, report = FALSE, mad_coef = 6, iqr_coef
stop(paste(errors, collapse = ", "))
}
result <- NULL
taxa <- unique(na.omit(aphiaids))

taxa <- table(na.omit(aphiaids))
if(!is.na(topn) & !is.null(topn) & topn > 0 & topn < length(taxa)) {
taxa <- sort(taxa, decreasing = TRUE)[1:topn]
}
taxa <- as.integer(names(taxa))
for(taxon in taxa) {
return_values = report # values are practical for creating the report
istaxon <- !is.na(aphiaids) & aphiaids == taxon
Expand Down Expand Up @@ -179,9 +185,9 @@ plot_outliers_environmental <- function(outliers_info, title = '', sample_outlie
upper = ifelse(is.null(o$q3) || is.na(o$q3), NA, o$q3) )
databox <- databox[complete.cases(databox),]
datapoints <- data_frame(Statistic=c(rep('MAD', n) , rep('IQR', n)),
Ok = c(o$ok_mad, o$ok_iqr),
Value=rep(o$values, 2), stringsAsFactors = TRUE)
datapoints <- datapoints %>% filter(!is.na(Value) & (as.character(Statistic) %in% as.character(databox$Statistic)) & !Ok)
Ok = c(o$ok_mad, o$ok_iqr),
Value=rep(o$values, 2))
datapoints <- datapoints %>% filter(!is.na(Ok) & !is.na(Value) & (as.character(Statistic) %in% as.character(databox$Statistic)) & !Ok)
if(nrow(databox) > 0 & nrow(datapoints) > 0) {
if(!is.null(sample_outliers) & !is.na(sample_outliers) & (sample_outliers < nrow(datapoints)) & sample_outliers > 10) {
datapoints <- sample_n(datapoints, sample_outliers)
Expand All @@ -208,15 +214,19 @@ plot_outliers_environmental <- function(outliers_info, title = '', sample_outlie

plot_outliers_spatial <- function(outliers_info, title='', sample_okpoints=1000) {
o <- outliers_info[['spatial']]
data <- as_data_frame(o$xy)
if (nrow(data[!o$ok_iqr | !o$ok_mad,]) > 0){
ok_na <- is.na(o$ok_iqr) | is.na(o$ok_mad)
ok_iqr <- o$ok_iqr[!ok_na]
ok_mad <- o$ok_mad[!ok_na]
data <- as_data_frame(o$xy[!ok_na,])

if (nrow(data[!ok_iqr | !ok_mad,]) > 0){
okcolor <- '#1b9e77'
data[,'color'] <- okcolor
data[,'radius'] <- 3.5
data[!o$ok_iqr & o$ok_mad, 'color'] <- '#d95f02'
data[!o$ok_iqr & !o$ok_mad, 'color'] <- '#e7298a'
data[o$ok_iqr & !o$ok_mad, 'color'] <- '#7570b3'
data <- unique(data)
data[!ok_iqr & ok_mad, 'color'] <- '#d95f02'
data[!ok_iqr & !ok_mad, 'color'] <- '#e7298a'
data[ok_iqr & !ok_mad, 'color'] <- '#7570b3'
data <- unique(data[complete.cases(data),])
if(!is.null(sample_okpoints) & !is.na(sample_okpoints) & sample_okpoints > 10 & nrow(data) > sample_okpoints) {
whichok <- which(data$color == okcolor)
if(length(whichok) > sample_okpoints) {
Expand Down
43 changes: 24 additions & 19 deletions R/plot_map.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,34 @@
#' @export
plot_map <- function(data, zoom = FALSE) {
check_lonlat(data, FALSE)
world <- borders("world", colour="gray80", fill="gray80")
m <- ggplot() +
world +
geom_point(data = data, aes(x = decimalLongitude, y = decimalLatitude), size = 2, stroke = 1, alpha = 0.3, colour = "#FF368B") +
xlab("longitude") +
ylab("latitude")
m <- NULL
data <- get_xy_clean(data)
if(NROW(data) > 0) {
world <- borders("world", colour="gray80", fill="gray80")
m <- ggplot() +
world +
geom_point(data = data, aes(x = decimalLongitude, y = decimalLatitude), size = 2, stroke = 1, alpha = 0.3, colour = "#FF368B") +
xlab("longitude") +
ylab("latitude")

xrange <- range(data$decimalLongitude, na.rm = TRUE)
yrange <- range(data$decimalLatitude, na.rm = TRUE)
xrange <- range(data$decimalLongitude, na.rm = TRUE)
yrange <- range(data$decimalLatitude, na.rm = TRUE)

if (zoom & all(is.finite(xrange)) & all(is.finite(yrange))) {
margin <- 0.3
dx <- margin * (xrange[2] - xrange[1])
dy <- margin * (yrange[2] - yrange[1])
xrange[1] <- xrange[1] - dx
xrange[2] <- xrange[2] + dx
yrange[1] <- yrange[1] - dy
yrange[2] <- yrange[2] + dy
m <- m + coord_quickmap(xlim = xrange, ylim = yrange)
if (zoom & all(is.finite(xrange)) & all(is.finite(yrange))) {
margin <- 0.3
dx <- margin * (xrange[2] - xrange[1])
dy <- margin * (yrange[2] - yrange[1])
xrange[1] <- xrange[1] - dx
xrange[2] <- xrange[2] + dx
yrange[1] <- yrange[1] - dy
yrange[2] <- yrange[2] + dy
m <- m + coord_quickmap(xlim = xrange, ylim = yrange)
} else {
m <- m + coord_quickmap()
}
} else {
m <- m + coord_quickmap()
warning("No valid coordinates found for plotting")
}

return(m)

}
Expand Down
7 changes: 4 additions & 3 deletions R/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ report_summary <- function(qcreport, maxrows) {
#' \code{rappdirs::user_cache_dir("obistools")}).
#' @param view Logical, show the report in a browser after creation (default
#' \code{TRUE}).
#' @param topnspecies Integer, number of species ordered by number of records
#' for which you want to do the outlier analysis
#' @return Returns the full path to the generated html report.
#' @examples
#' \dontrun{
#' report(abra)
#' }
#' @export
report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE) {
report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRUE, topnspecies = 20) {

reportfile <- system.file("", "report.Rmd", package = "obistools")

Expand All @@ -51,7 +53,7 @@ report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRU
check_onland(data, report = TRUE),
check_depth(data, report = TRUE),
check_outliers_dataset(data, report = TRUE),
check_outliers_species(data, report = TRUE)
check_outliers_species(data, report = TRUE, topn = topnspecies)
)
qc <- distinct(qc)
}
Expand All @@ -63,4 +65,3 @@ report <- function(data, qc = NULL, file = "report.html", dir = NULL, view = TRU
}
return(outputfile)
}

31 changes: 23 additions & 8 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,38 +39,53 @@ check_lonlat <- function(data, report) {
}
if(length(errors) > 0) {
if(report) {
return(data.frame(level = "error", message = errors, stringsAsFactors = FALSE))
return(data_frame(level = "error", message = errors))
} else {
stop(paste(errors, collapse = ", "))
}
}
return(NULL)
}

get_xy_clean_duplicates <- function(data) {

get_xy_clean <- function(data, returnisclean=FALSE) {
check_lonlat(data, report = FALSE)
sp <- data.frame(decimalLongitude = numeric(0), decimalLatitude = numeric(0))
isclean <- NULL
if(NROW(data) > 0) {
sp <- data %>% select(decimalLongitude, decimalLatitude)
# Only lookup values for valid coordinates
# Only valid coordinates
isclean <- stats::complete.cases(sp) &
sapply(sp$decimalLongitude, is.numeric) &
sapply(sp$decimalLatitude, is.numeric) &
!is.na(sp$decimalLongitude) & !is.na(sp$decimalLatitude) &
sp$decimalLongitude >= -180.0 & sp$decimalLongitude <= 180.0 &
sp$decimalLatitude >= -90.0 & sp$decimalLatitude <= 90.0
cleansp <- sp[isclean,,drop=FALSE]
}
cleansp <- sp[isclean,,drop=FALSE]
if(returnisclean) {
return(list(cleansp=cleansp, isclean=isclean))
} else {
return(cleansp)
}
}

get_xy_clean_duplicates <- function(data) {
clean <- get_xy_clean(data, returnisclean = TRUE)
if(NROW(clean$cleansp) > 0) {
# Only lookup values for unique coordinates
key <- paste(cleansp$decimalLongitude, cleansp$decimalLatitude, sep='\r')
key <- paste(clean$cleansp$decimalLongitude, clean$cleansp$decimalLatitude, sep='\r')
notdup <- !duplicated(key)
uniquesp <- cleansp[notdup,]
uniquesp <- clean$cleansp[notdup,]
duplicated_lookup <- match(key, key[notdup])
list(uniquesp=uniquesp, isclean=isclean, duplicated_lookup=duplicated_lookup)
list(uniquesp=uniquesp, isclean=clean$isclean, duplicated_lookup=duplicated_lookup)
} else {
list(uniquesp = data.frame(decimalLongitude = numeric(0), decimalLatitude = numeric(0)),
isclean = NULL, duplicated_lookup = NULL)
}
}


list_cache <- function() {
list.files(rappdirs::user_cache_dir("obistools"), "call_", full.names = TRUE)
}
Expand All @@ -89,7 +104,7 @@ cache_call <- function(key, expr, env = NULL) {
}
cache_dir <- rappdirs::user_cache_dir("obistools")
cachefile <- file.path(cache_dir, paste0("call_", digest::digest(list(key=key, expr=expr)), ".rds"))
if(file.exists(cachefile) && difftime(Sys.time(), file.info(cachefile)[,"mtime"], units = "hours") < 36) {
if(file.exists(cachefile) && difftime(Sys.time(), file.info(cachefile)[,"mtime"], units = "hours") < 10) {
return(readRDS(cachefile))
} else {
result <- eval(expr, envir = NULL, enclos = env)
Expand Down
3 changes: 3 additions & 0 deletions man/check_outliers_species.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/report.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b848a47

Please sign in to comment.