Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chronogram database update #91

Merged
merged 16 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: datelife
Title: Scientific Data on Time of Lineage Divergence for Your Taxa
Version: 0.6.8
Version: 0.6.9
Maintainer: Luna L. Sanchez Reyes <[email protected]>
Authors@R: c(
person("Brian", "O'Meara", email = "[email protected]", role=c("aut")),
Expand Down Expand Up @@ -69,7 +69,8 @@ Suggests:
devtools,
covr,
msa,
Biostrings
Biostrings,
spelling
LazyDataCompression: xz
SystemRequirements: PATHd8
URL: https://github.com/phylotastic/datelife, http://phylotastic.org/datelife/
Expand All @@ -79,3 +80,4 @@ RoxygenNote: 7.2.3
Roxygen: list(markdown = TRUE)
Encoding: UTF-8
VignetteBuilder: knitr
Language: en-US
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@

DONE:
-->
# datelife v0.6.9
- fix bug in `check_ott_input()`
- fix bug in `make_datelife_query()` when getting ott ids
- use spaces instead of "_" to run tnrs, avoids bug from `rotl::tnrs_match_names()` v3.0.14
- faster and more accurate way to get study ids and tree ids from opentree API on `get_opentree_chronograms()` that have branch lengths in Myrs, no relative time.
- update to chronogram database, now with 292 chronograms.
- `get_taxon_summary()` now manages case when `datelife_result` is empty; throws warning instead of criptic error.
- better `testthat` suite for `datelife_search()` inner functions.


# datelife v0.6.8
- fix bug in function `extract_calibrations_phylo()`
- update messages in `calibrations_match()`
Expand Down
2 changes: 1 addition & 1 deletion R/calibrations_use.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ use_calibrations <- function(phy = NULL,
# enhance: add a check for calibrations object structure
if (!inherits(calibrations, "data.frame")) {
exit <- TRUE
msg2 <- "'calibrations' is NOT a data frame. Check this."
msg2 <- "'calibrations' is NOT a data frame. Correct this."
} else {
msg2 <- "'calibrations' is a data frame. You are good."
}
Expand Down
12 changes: 6 additions & 6 deletions R/datelife_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,20 @@ make_datelife_query <- function(input = c("Rhea americana", "Pterocnemia pennata
if (use_tnrs_global) {
# process names even if it's a "higher" taxon name:
cleaned_names_tnrs <- clean_tnrs(tnrs_match(input = cleaned_names),
remove_nonmatches = TRUE)
remove_nonmatches = FALSE)
# recover original names of invalid taxa and unmatched:
cleaned_names <- gsub("_", " ", cleaned_names)
ii <- !tolower(cleaned_names) %in% cleaned_names_tnrs$search_string
cleaned_names <- c(cleaned_names_tnrs$unique_name, cleaned_names[ii])
cleaned_names <- gsub(" ", "_", cleaned_names)
if (inherits(phy_new, "phylo")) {
if (is.null(phy_new$ott_ids)) {
# after some tests, decided to use rotl's method instead of taxize::gnr_resolve, and just output the original input and the actual query for users to check out.
# cleaned_names <- taxize::gnr_resolve(names = cleaned_names, data_source_ids=179, fields="all")$matched_name
# rename the tip labels with tnrs matched names
# rename phy tip labels with TNRS matched names:
ott_ids <- rep(NA, length(cleaned_names))
ii <- match(cleaned_names_tnrs$search_string,
tolower(phy_new$tip.label))
phy_new$tip.label[ii] <- cleaned_names[ii]
ii <- match(gsub(" ", "_", cleaned_names_tnrs$search_string),
tolower(gsub(" ", "_", phy_new$tip.label)))
phy_new$tip.label[ii] <- gsub(" ", "_", cleaned_names)[ii]
ott_ids[ii] <- cleaned_names_tnrs$ott_id
phy_new$ott_ids <- ott_ids
}
Expand Down
7 changes: 4 additions & 3 deletions R/datelife_result.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ get_datelife_result_datelifequery <- function(datelife_query = NULL,
stop("'datelife_query' must be a 'datelifeQuery' object.")
}
if (length(datelife_query$cleaned_names) == 1) {
message("Can't get divergence times from just one taxon in 'datelife_query$cleaned_names'.")
message("Can't get divergence times from just one taxon available in 'datelife_query$cleaned_names'.")
message("Making a 'datelifeQuery' again, setting 'get_spp_from_taxon = TRUE'.")
datelife_query <- make_datelife_query(input = datelife_query$cleaned_names,
get_spp_from_taxon = TRUE,
Expand All @@ -46,16 +46,17 @@ get_datelife_result_datelifequery <- function(datelife_query = NULL,
# do that later in summarizing steps
results_list <- lapply(cache$trees,
get_subset_array_dispatch,
taxa = datelife_query$cleaned_names,
taxa = gsub(" ", "_", datelife_query$cleaned_names),
phy = NULL
)
# length(results_list) is always the same size as the number of chronograms in the database
datelife_result <- results_list_process(results_list,
datelife_query$cleaned_names,
partial = partial
)
message("Search done!")
message("\nInput taxon names were found in ", length(datelife_result), " chronograms.")
class(datelife_result) <- c("datelifeResult")
class(datelife_result) <- c(class(datelife_result), "datelifeResult")
attr(datelife_result, "datelife_query") <- datelife_query
return(datelife_result)
}
Expand Down
6 changes: 1 addition & 5 deletions R/datelife_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,8 @@ datelife_search <- function(input = c("Rhea americana", "Pterocnemia pennata", "
datelife_result.here <- get_datelife_result_datelifequery(
datelife_query = datelife_query,
partial = partial,
# approximate_match = approximate_match,
cache = cache)
# print.datelife(datelife_result = datelife_result.here)
# datelife <- list(datelife_query = datelife_query, datelife_result = datelife_result.here)
# class(datelife) <- "datelife"
# return(datelife)

res <- summarize_datelife_result(
datelife_result = datelife_result.here,
datelife_query = datelife_query,
Expand Down
19 changes: 15 additions & 4 deletions R/datelife_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,28 @@ get_taxon_summary <- function(datelife_result = NULL,
datelife_query = NULL) {

# datelife_result <- check_datelife_result(datelife_result)
if (is.null(datelife_result) | !inherits(datelife_result, "datelifeResult")) {
if (is.null(datelife_result) | !inherits(datelife_result, "list")) {
warning("'datelife_result' argument must be a list of patristic matrices (you can get one with get_datelife_result()).")
message("Taxon summary can not be generated.")
return(NA)
}
if (any(!sapply(datelife_result, inherits, "array"))) {
warning("Some (or all) elements of 'datelife_result' list are not of class 'array' (you can get the correct format from get_datelife_result()).")
message("Taxon summary can not be generated.")
return(NA)
}
if (length(datelife_result) == 0) {
warning("'datelife_result' is empty (length == 0).")
message("Taxon summary can not be generated.")
return(NA)
}

if (suppressMessages(is_datelife_query(input = datelife_query))) {
if (is.null(attributes(datelife_result)$datelife_query)) {
cleaned_names <- datelife_query$cleaned_names
cleaned_names <- gsub(" ", "_", datelife_query$cleaned_names)
} else {
input <- attributes(datelife_result)$datelife_query
cleaned_names <- attributes(datelife_result)$datelife_query$cleaned_names
input$cleaned_names <- gsub(" ", "_", input$cleaned_names)
cleaned_names <- input$cleaned_names
}
} else {
message("'datelife_query' argument was not provided.")
Expand Down
203 changes: 190 additions & 13 deletions R/opentree_chronograms.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,182 @@
#' @keywords opentree dates myrs million years time phylogeny chronogram
#' @details
#' Generated with
#' devtools::install_github("ropensci/rotl", ref = devtools::github_pull("137"))
#' remotes::install_github("ROpenSci/bibtex")'
#' opentree_chronograms <- get_opentree_chronograms()
#' opentree_chronograms$update <- Sys.time()
#' opentree_chronograms$version <- '2022.01.28'
#' opentree_chronograms$version <- '2023.12.30'
#' usethis::use_data(opentree_chronograms, overwrite = T, compress = "xz")
#' and updated with update_datelife_cache()
"opentree_chronograms"

#' Get all chronograms from Open Tree of Life database using direct call from Open Tree API
#' @param max_tree_count Default to "all", it gets all available chronograms. For testing purposes, a numeric value indicating the max number of trees to be cached.
#' @return A list of 4 elements:
#' \describe{
#' \item{authors}{A list of lists of author names of the original studies that
#' published chronograms currently stored in the Open Tree of Life database.}
#' \item{curators}{A list of lists of curator names that uploaded chronograms
#' to the Open Tree of Life database.}
#' \item{studies}{A list of study identifiers from original studies that
#' published chronograms currently stored in the Open Tree of Life database.}
#' \item{trees}{A `multiPhylo` object storing the chronograms from Open Tree of
#' Life database.}
#' \item{update}{A character vector indicating the time when the database object
#' was last updated.}
#' \item{version}{A character vector indicating the datelife package version when the
#' object was last updated.}
#' }
#' @export
get_opentree_chronograms <- function(max_tree_count = "all") {
options(warn = 1)
start_time <- Sys.time() # to register run time
result <- httr::POST("https://api.opentreeoflife.org/cached/v3/studies/find_trees",
body = '{"property":"ot:branchLengthTimeUnit","value":"Myr", "verbose":true}',
httr::add_headers(.headers = "content-type:application/json"))

output <- httr::content(result)
# length(output$matched_studies)
data_frame_output <- data.frame(study_id = character(), tree_id = character())
for(i in output) {
for (e in i) {
for (tree in e$matched_trees) {
if(tree$`ot:branchLengthMode` == "ot:time"){
study_id = tree$`ot:studyId`
tree_id = tree$`ot:treeId`
data_frame_output <- rbind(data_frame_output, c(study_id, tree_id))
}
}
}
}
names(data_frame_output) <- c("study_id", "tree_id")
trees <- list()
authors <- list()
curators <- list()
studies <- list()
dois <- list()
tree_count <- 0
bad_ones <- c()
ott_id_problems <- data.frame(study_id = character(), tree_id = character()) # nrow(fix_negative_brlen) is 0
# utils::opentree_chronograms
for (id in seq(nrow(data_frame_output))) {
# the only purpose for the following conditional is testing:
if (is.numeric(max_tree_count)) {
if (tree_count > max_tree_count) {
break
}
}
study_id <- data_frame_output$study_id[id] # pg_2853, pg_753
# study_id <- "ot_708"; study_id <- "pg_2853"
# tree_id <- "tree1"; tree_id <- "tree6624"
tree_id <- data_frame_output$tree_id[id] # tree6624, tree1383
message("Downloading tree(s) from study '",
study_id,
"' (",
tree_count + 1,
" out of ",
length(unique(data_frame_output$study_id)),
" studies).")
new_tree_taxon_name <- new_tree_ott_id <- NULL
# if (!grepl("\\.\\.\\.", tree_id) & !is.na(tree_id)) { # we do not need to deal with ellipsis bug anymore
message("Tree with tree_id = '", tree_id, "'")
new_tree_taxon_name <- tryCatch(rotl::get_study_tree(study_id = study_id,
tree_id = tree_id,
tip_label = "ott_taxon_name",
deduplicate = TRUE),
error = function(e) NULL)
# would like to dedup; don't use get_study_subtree, as right now it doesn't take tip_label args
# try(new_tree <- datelife:::get_study_tree_with_dups(study_id=study_id,tree_id=tree_id ))
# try(new_tree <- rotl::get_study_subtree(study_id=study_id,tree_id=tree_id, tip_label="ott_taxon_name", subtree_id="ingroup")) #only want ingroup, as it's the one that's been lovingly curated.
if (!is.null(new_tree_taxon_name) & phylo_has_brlen(phy = new_tree_taxon_name)) {
# add ott_ids
# right now the function is having trouble to retrieve trees with ott ids as tip labels from certain studies
new_tree_ott_id <- tryCatch(rotl::get_study_tree(study_id = study_id,
tree_id = tree_id,
tip_label = "ott_id"),
error = function(e) NULL)
# remove the tag to unmapped tips, but leave the "*tip" marker:
# new_tree_taxon_name$ott_ids <- gsub("_.*", "", new_tree_ott_id$tip.label)
# if new_tree2 is null it will generate an empty vector
# we were running TNRS for unmapped tip labels, but it is unsafe, so do not do it:
# try_tree <- datelife::clean_ott_chronogram(new_tree)
# previous line will give NA if just one or no tip labels are mapped to OTT??
new_tree_taxon_name$ott_ids <- suppressWarnings(as.numeric(new_tree_ott_id$tip.label))
if (all(is.na(new_tree_taxon_name$ott_ids))) {
problem <- "Open Tree of Life database does not have ott ids for this tree (names have not been curated)"
ott_id_problems <- rbind(ott_id_problems, data.frame(study_id, tree_id))
}
if (phylo_has_brlen(phy = new_tree_taxon_name)) {
if (is_good_chronogram(new_tree_taxon_name)) {
message("'", tree_id, "' has branch lengths and is ultrametric.")
study_meta <- rotl::get_study_meta(study_id)
publication_meta <- rotl::get_publication(study_meta)
doi <- NULL
try(doi <- gsub("https?://(dx\\.)?doi.org/", "", attr(publication_meta, "DOI")))
authors <- append(authors, NA)
if (length(doi) == 0) {
warning(paste(study_id, "has no DOI attribute, author names will not be retrieved."))
} else {
try(authors[length(authors)] <- list(paste(as.character(knitcitations::bib_metadata(doi)$author))))
}
curators <- append(curators, NA)
try(curators[length(curators)] <- list(study_meta[["nexml"]][["^ot:curatorName"]]))
try(studies <- append(studies, study_id))
tree_count <- tree_count + 1
# print(tree_count)
try(dois <- append(dois, doi))
trees[[tree_count]] <- new_tree_taxon_name
names(trees)[tree_count] <- publication_meta[1]
message("'", tree_id, "' was added to chronogram database.")
potential_bad <- NULL
}
}
# add taxonomic nodelabels to trees object here
new_tree_taxon_name <- map_nodes_ott(tree = new_tree_taxon_name)
}


if (!is.null(potential_bad)) {
bad_ones <- append(bad_ones, potential_bad)
}

}
message("Problematic combos:")
# Fix name encoding:
message(paste0(utils::capture.output(bad_ones), collapse = "\n"))
for (i in sequence(length(authors))) {
if (!any(is.na(authors[[i]])) & length(authors[[i]]) > 0) {
Encoding(authors[[i]]) <- "latin1"
authors[[i]] <- iconv(authors[[i]], "latin1", "UTF-8")
}
for (j in sequence(length(curators[[i]]))) {
if (!any(is.na(curators[[i]][[j]])) & length(curators[[i]][[j]]) > 0) {
Encoding(curators[[i]][[j]]) <- "latin1"
curators[[i]][[j]] <- iconv(curators[[i]][[j]], "latin1", "UTF-8")
}
}
}

if (nrow(ott_id_problems) > 0) {
problems_file <- file.path(tempdir(), paste0("ott_id_problems_", max_tree_count, ".csv"))
utils::write.csv(ott_id_problems,
file = problems_file,
quote = FALSE, row.names = FALSE
)
message("Problematic chronograms were saved to ", problems_file)
} else {
message("There were no problematic chronograms.")
}
tot_time <- Sys.time() - start_time # end of registering function running time
class(trees) <- "multiPhylo"
result <- list(trees = trees,
authors = authors,
curators = curators,
studies = studies,
dois = dois,
update = Sys.time(),
version = utils::packageVersion("datelife"))
attr(result, "running_time") <- tot_time
message(tot_time)
return(result)
}

#' Create an updated OpenTree chronograms database object

Expand Down Expand Up @@ -134,17 +301,22 @@ phylo_has_brlen <- function(phy) {
#' \item{version}{A character vector indicating the datelife package version when the
#' object was last updated.}
#' }
#' @export
get_opentree_chronograms <- function(max_tree_count = "all") {
# #' @export
get_opentree_chronograms_slow <- function(max_tree_count = "all") {
options(warn = 1)
start_time <- Sys.time() # to register run time
chronogram_matches <- rotl::studies_find_trees(property = "ot:branchLengthTimeUnit",
value = "Myr",
detailed = TRUE,
verbose = FALSE)
# rotl::studies_find_trees throws this Warning in rbind(c(n_trees = "1", tree_ids = "tree6160", candidate = "tree6160", :
message()
chronogram_matches <- suppressWarnings(
rotl::studies_find_trees(property = "ot:branchLengthTimeUnit",
value = "Myr",
detailed = TRUE,
verbose = FALSE))
# rotl::studies_find_trees throws this Warning for different studies:
# Warning in rbind(c(n_trees = "1", tree_ids = "tree6160", candidate = "tree6160", : pg_2641
# tree6160 <- rotl::get_study_tree(study_id = "pg_2641", tree_id = "tree6160")
# number of columns of result is not a multiple of vector length (arg 1) colnames(chronogram_matches)
# rotl::list_trees(chronogram_matches)
# study_id = "ot_615", tree_id = "Tr76459"
trees <- list()
authors <- list()
curators <- list()
Expand Down Expand Up @@ -298,8 +470,8 @@ is_good_chronogram <- function(phy) {
warning("tree failed over not having more internal nodes than tips")
}
if (length(which(grepl("not.mapped", phy$tip.label))) > 0) {
warning("tree failed over having not mapped taxa that should have been checked")
passing <- FALSE # not cleaned properly
warning("tree has tip labels that are not mapped to the Open Tree Taxonomy")
passing <- TRUE # not cleaned properly
}
# enhance: test that there are no duplicated labels in chronogram:
if (any(is.na(phy$tip.label))) {
Expand Down Expand Up @@ -443,6 +615,11 @@ map_nodes_ott <- function(tree) {
# tree <- ss[[1]]
# phy <- opentree_chronograms$trees[[i]]
phy <- tree_check(tree = tree, dated = FALSE)
# check that at least some tip labels have OTT names:
if (all(is.na(tree$ott_ids))) {
warning("OTT nodes could not be mapped on tree.")
return(tree)
}
cc <- tryCatch(classification_paths_from_taxonomy(phy$tip.label, sources = "Open Tree of Life Reference Taxonomy"),
error = function(e) list(resolved = c())
) # this traps the error when phy = opentree_chronograms$trees[[50]] from load(data-raw/opentree_chronograms_oct2018.rda)
Expand Down
1 change: 1 addition & 0 deletions R/opentree_taxonomy_general.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ tnrs_match.default <- function(input, reference_taxonomy = "ott", ...) { # enhan
toupper(reference_taxonomy),
").")
input <- stringr::str_trim(input, side = "both") # cleans the input of lingering unneeded white spaces
input <- gsub("_", " ", input) # do tnrs with spaces and not underscore to avoid bug from rotl v3.0.14 (felis_margarita to Margarites_helicinus)
allinput <- input
input <- unique(input)
# enhance: infer taxonomic contexts:
Expand Down
Loading
Loading