Skip to content

Commit

Permalink
Merge pull request #91 from LunaSare/master
Browse files Browse the repository at this point in the history
chronogram database update (Jan 2024)
  • Loading branch information
LunaSare authored Apr 26, 2024
2 parents c668b40 + ea114fb commit 2c848b2
Show file tree
Hide file tree
Showing 273 changed files with 1,454 additions and 454 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: datelife
Title: Scientific Data on Time of Lineage Divergence for Your Taxa
Version: 0.6.8
Version: 0.6.9
Maintainer: Luna L. Sanchez Reyes <[email protected]>
Authors@R: c(
person("Brian", "O'Meara", email = "[email protected]", role=c("aut")),
Expand Down Expand Up @@ -69,7 +69,8 @@ Suggests:
devtools,
covr,
msa,
Biostrings
Biostrings,
spelling
LazyDataCompression: xz
SystemRequirements: PATHd8
URL: https://github.com/phylotastic/datelife, http://phylotastic.org/datelife/
Expand All @@ -79,3 +80,4 @@ RoxygenNote: 7.2.3
Roxygen: list(markdown = TRUE)
Encoding: UTF-8
VignetteBuilder: knitr
Language: en-US
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,16 @@
DONE:
-->
# datelife v0.6.9
- fix bug in `check_ott_input()`
- fix bug in `make_datelife_query()` when getting ott ids
- use spaces instead of "_" to run tnrs, avoids bug from `rotl::tnrs_match_names()` v3.0.14
- faster and more accurate way to get study ids and tree ids from opentree API on `get_opentree_chronograms()` that have branch lengths in Myrs, no relative time.
- update to chronogram database, now with 292 chronograms.
- `get_taxon_summary()` now manages case when `datelife_result` is empty; throws warning instead of criptic error.
- better `testthat` suite for `datelife_search()` inner functions.


# datelife v0.6.8
- fix bug in function `extract_calibrations_phylo()`
- update messages in `calibrations_match()`
Expand Down
2 changes: 1 addition & 1 deletion R/calibrations_use.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ use_calibrations <- function(phy = NULL,
# enhance: add a check for calibrations object structure
if (!inherits(calibrations, "data.frame")) {
exit <- TRUE
msg2 <- "'calibrations' is NOT a data frame. Check this."
msg2 <- "'calibrations' is NOT a data frame. Correct this."
} else {
msg2 <- "'calibrations' is a data frame. You are good."
}
Expand Down
12 changes: 6 additions & 6 deletions R/datelife_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,20 @@ make_datelife_query <- function(input = c("Rhea americana", "Pterocnemia pennata
if (use_tnrs_global) {
# process names even if it's a "higher" taxon name:
cleaned_names_tnrs <- clean_tnrs(tnrs_match(input = cleaned_names),
remove_nonmatches = TRUE)
remove_nonmatches = FALSE)
# recover original names of invalid taxa and unmatched:
cleaned_names <- gsub("_", " ", cleaned_names)
ii <- !tolower(cleaned_names) %in% cleaned_names_tnrs$search_string
cleaned_names <- c(cleaned_names_tnrs$unique_name, cleaned_names[ii])
cleaned_names <- gsub(" ", "_", cleaned_names)
if (inherits(phy_new, "phylo")) {
if (is.null(phy_new$ott_ids)) {
# after some tests, decided to use rotl's method instead of taxize::gnr_resolve, and just output the original input and the actual query for users to check out.
# cleaned_names <- taxize::gnr_resolve(names = cleaned_names, data_source_ids=179, fields="all")$matched_name
# rename the tip labels with tnrs matched names
# rename phy tip labels with TNRS matched names:
ott_ids <- rep(NA, length(cleaned_names))
ii <- match(cleaned_names_tnrs$search_string,
tolower(phy_new$tip.label))
phy_new$tip.label[ii] <- cleaned_names[ii]
ii <- match(gsub(" ", "_", cleaned_names_tnrs$search_string),
tolower(gsub(" ", "_", phy_new$tip.label)))
phy_new$tip.label[ii] <- gsub(" ", "_", cleaned_names)[ii]
ott_ids[ii] <- cleaned_names_tnrs$ott_id
phy_new$ott_ids <- ott_ids
}
Expand Down
7 changes: 4 additions & 3 deletions R/datelife_result.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ get_datelife_result_datelifequery <- function(datelife_query = NULL,
stop("'datelife_query' must be a 'datelifeQuery' object.")
}
if (length(datelife_query$cleaned_names) == 1) {
message("Can't get divergence times from just one taxon in 'datelife_query$cleaned_names'.")
message("Can't get divergence times from just one taxon available in 'datelife_query$cleaned_names'.")
message("Making a 'datelifeQuery' again, setting 'get_spp_from_taxon = TRUE'.")
datelife_query <- make_datelife_query(input = datelife_query$cleaned_names,
get_spp_from_taxon = TRUE,
Expand All @@ -46,16 +46,17 @@ get_datelife_result_datelifequery <- function(datelife_query = NULL,
# do that later in summarizing steps
results_list <- lapply(cache$trees,
get_subset_array_dispatch,
taxa = datelife_query$cleaned_names,
taxa = gsub(" ", "_", datelife_query$cleaned_names),
phy = NULL
)
# length(results_list) is always the same size as the number of chronograms in the database
datelife_result <- results_list_process(results_list,
datelife_query$cleaned_names,
partial = partial
)
message("Search done!")
message("\nInput taxon names were found in ", length(datelife_result), " chronograms.")
class(datelife_result) <- c("datelifeResult")
class(datelife_result) <- c(class(datelife_result), "datelifeResult")
attr(datelife_result, "datelife_query") <- datelife_query
return(datelife_result)
}
Expand Down
6 changes: 1 addition & 5 deletions R/datelife_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,8 @@ datelife_search <- function(input = c("Rhea americana", "Pterocnemia pennata", "
datelife_result.here <- get_datelife_result_datelifequery(
datelife_query = datelife_query,
partial = partial,
# approximate_match = approximate_match,
cache = cache)
# print.datelife(datelife_result = datelife_result.here)
# datelife <- list(datelife_query = datelife_query, datelife_result = datelife_result.here)
# class(datelife) <- "datelife"
# return(datelife)

res <- summarize_datelife_result(
datelife_result = datelife_result.here,
datelife_query = datelife_query,
Expand Down
19 changes: 15 additions & 4 deletions R/datelife_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,28 @@ get_taxon_summary <- function(datelife_result = NULL,
datelife_query = NULL) {

# datelife_result <- check_datelife_result(datelife_result)
if (is.null(datelife_result) | !inherits(datelife_result, "datelifeResult")) {
if (is.null(datelife_result) | !inherits(datelife_result, "list")) {
warning("'datelife_result' argument must be a list of patristic matrices (you can get one with get_datelife_result()).")
message("Taxon summary can not be generated.")
return(NA)
}
if (any(!sapply(datelife_result, inherits, "array"))) {
warning("Some (or all) elements of 'datelife_result' list are not of class 'array' (you can get the correct format from get_datelife_result()).")
message("Taxon summary can not be generated.")
return(NA)
}
if (length(datelife_result) == 0) {
warning("'datelife_result' is empty (length == 0).")
message("Taxon summary can not be generated.")
return(NA)
}

if (suppressMessages(is_datelife_query(input = datelife_query))) {
if (is.null(attributes(datelife_result)$datelife_query)) {
cleaned_names <- datelife_query$cleaned_names
cleaned_names <- gsub(" ", "_", datelife_query$cleaned_names)
} else {
input <- attributes(datelife_result)$datelife_query
cleaned_names <- attributes(datelife_result)$datelife_query$cleaned_names
input$cleaned_names <- gsub(" ", "_", input$cleaned_names)
cleaned_names <- input$cleaned_names
}
} else {
message("'datelife_query' argument was not provided.")
Expand Down
203 changes: 190 additions & 13 deletions R/opentree_chronograms.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,182 @@
#' @keywords opentree dates myrs million years time phylogeny chronogram
#' @details
#' Generated with
#' devtools::install_github("ropensci/rotl", ref = devtools::github_pull("137"))
#' remotes::install_github("ROpenSci/bibtex")'
#' opentree_chronograms <- get_opentree_chronograms()
#' opentree_chronograms$update <- Sys.time()
#' opentree_chronograms$version <- '2022.01.28'
#' opentree_chronograms$version <- '2023.12.30'
#' usethis::use_data(opentree_chronograms, overwrite = T, compress = "xz")
#' and updated with update_datelife_cache()
"opentree_chronograms"

#' Get all chronograms from Open Tree of Life database using direct call from Open Tree API
#' @param max_tree_count Default to "all", it gets all available chronograms. For testing purposes, a numeric value indicating the max number of trees to be cached.
#' @return A list of 4 elements:
#' \describe{
#' \item{authors}{A list of lists of author names of the original studies that
#' published chronograms currently stored in the Open Tree of Life database.}
#' \item{curators}{A list of lists of curator names that uploaded chronograms
#' to the Open Tree of Life database.}
#' \item{studies}{A list of study identifiers from original studies that
#' published chronograms currently stored in the Open Tree of Life database.}
#' \item{trees}{A `multiPhylo` object storing the chronograms from Open Tree of
#' Life database.}
#' \item{update}{A character vector indicating the time when the database object
#' was last updated.}
#' \item{version}{A character vector indicating the datelife package version when the
#' object was last updated.}
#' }
#' @export
get_opentree_chronograms <- function(max_tree_count = "all") {
options(warn = 1)
start_time <- Sys.time() # to register run time
result <- httr::POST("https://api.opentreeoflife.org/cached/v3/studies/find_trees",
body = '{"property":"ot:branchLengthTimeUnit","value":"Myr", "verbose":true}',
httr::add_headers(.headers = "content-type:application/json"))

output <- httr::content(result)
# length(output$matched_studies)
data_frame_output <- data.frame(study_id = character(), tree_id = character())
for(i in output) {
for (e in i) {
for (tree in e$matched_trees) {
if(tree$`ot:branchLengthMode` == "ot:time"){
study_id = tree$`ot:studyId`
tree_id = tree$`ot:treeId`
data_frame_output <- rbind(data_frame_output, c(study_id, tree_id))
}
}
}
}
names(data_frame_output) <- c("study_id", "tree_id")
trees <- list()
authors <- list()
curators <- list()
studies <- list()
dois <- list()
tree_count <- 0
bad_ones <- c()
ott_id_problems <- data.frame(study_id = character(), tree_id = character()) # nrow(fix_negative_brlen) is 0
# utils::opentree_chronograms
for (id in seq(nrow(data_frame_output))) {
# the only purpose for the following conditional is testing:
if (is.numeric(max_tree_count)) {
if (tree_count > max_tree_count) {
break
}
}
study_id <- data_frame_output$study_id[id] # pg_2853, pg_753
# study_id <- "ot_708"; study_id <- "pg_2853"
# tree_id <- "tree1"; tree_id <- "tree6624"
tree_id <- data_frame_output$tree_id[id] # tree6624, tree1383
message("Downloading tree(s) from study '",
study_id,
"' (",
tree_count + 1,
" out of ",
length(unique(data_frame_output$study_id)),
" studies).")
new_tree_taxon_name <- new_tree_ott_id <- NULL
# if (!grepl("\\.\\.\\.", tree_id) & !is.na(tree_id)) { # we do not need to deal with ellipsis bug anymore
message("Tree with tree_id = '", tree_id, "'")
new_tree_taxon_name <- tryCatch(rotl::get_study_tree(study_id = study_id,
tree_id = tree_id,
tip_label = "ott_taxon_name",
deduplicate = TRUE),
error = function(e) NULL)
# would like to dedup; don't use get_study_subtree, as right now it doesn't take tip_label args
# try(new_tree <- datelife:::get_study_tree_with_dups(study_id=study_id,tree_id=tree_id ))
# try(new_tree <- rotl::get_study_subtree(study_id=study_id,tree_id=tree_id, tip_label="ott_taxon_name", subtree_id="ingroup")) #only want ingroup, as it's the one that's been lovingly curated.
if (!is.null(new_tree_taxon_name) & phylo_has_brlen(phy = new_tree_taxon_name)) {
# add ott_ids
# right now the function is having trouble to retrieve trees with ott ids as tip labels from certain studies
new_tree_ott_id <- tryCatch(rotl::get_study_tree(study_id = study_id,
tree_id = tree_id,
tip_label = "ott_id"),
error = function(e) NULL)
# remove the tag to unmapped tips, but leave the "*tip" marker:
# new_tree_taxon_name$ott_ids <- gsub("_.*", "", new_tree_ott_id$tip.label)
# if new_tree2 is null it will generate an empty vector
# we were running TNRS for unmapped tip labels, but it is unsafe, so do not do it:
# try_tree <- datelife::clean_ott_chronogram(new_tree)
# previous line will give NA if just one or no tip labels are mapped to OTT??
new_tree_taxon_name$ott_ids <- suppressWarnings(as.numeric(new_tree_ott_id$tip.label))
if (all(is.na(new_tree_taxon_name$ott_ids))) {
problem <- "Open Tree of Life database does not have ott ids for this tree (names have not been curated)"
ott_id_problems <- rbind(ott_id_problems, data.frame(study_id, tree_id))
}
if (phylo_has_brlen(phy = new_tree_taxon_name)) {
if (is_good_chronogram(new_tree_taxon_name)) {
message("'", tree_id, "' has branch lengths and is ultrametric.")
study_meta <- rotl::get_study_meta(study_id)
publication_meta <- rotl::get_publication(study_meta)
doi <- NULL
try(doi <- gsub("https?://(dx\\.)?doi.org/", "", attr(publication_meta, "DOI")))
authors <- append(authors, NA)
if (length(doi) == 0) {
warning(paste(study_id, "has no DOI attribute, author names will not be retrieved."))
} else {
try(authors[length(authors)] <- list(paste(as.character(knitcitations::bib_metadata(doi)$author))))
}
curators <- append(curators, NA)
try(curators[length(curators)] <- list(study_meta[["nexml"]][["^ot:curatorName"]]))
try(studies <- append(studies, study_id))
tree_count <- tree_count + 1
# print(tree_count)
try(dois <- append(dois, doi))
trees[[tree_count]] <- new_tree_taxon_name
names(trees)[tree_count] <- publication_meta[1]
message("'", tree_id, "' was added to chronogram database.")
potential_bad <- NULL
}
}
# add taxonomic nodelabels to trees object here
new_tree_taxon_name <- map_nodes_ott(tree = new_tree_taxon_name)
}


if (!is.null(potential_bad)) {
bad_ones <- append(bad_ones, potential_bad)
}

}
message("Problematic combos:")
# Fix name encoding:
message(paste0(utils::capture.output(bad_ones), collapse = "\n"))
for (i in sequence(length(authors))) {
if (!any(is.na(authors[[i]])) & length(authors[[i]]) > 0) {
Encoding(authors[[i]]) <- "latin1"
authors[[i]] <- iconv(authors[[i]], "latin1", "UTF-8")
}
for (j in sequence(length(curators[[i]]))) {
if (!any(is.na(curators[[i]][[j]])) & length(curators[[i]][[j]]) > 0) {
Encoding(curators[[i]][[j]]) <- "latin1"
curators[[i]][[j]] <- iconv(curators[[i]][[j]], "latin1", "UTF-8")
}
}
}

if (nrow(ott_id_problems) > 0) {
problems_file <- file.path(tempdir(), paste0("ott_id_problems_", max_tree_count, ".csv"))
utils::write.csv(ott_id_problems,
file = problems_file,
quote = FALSE, row.names = FALSE
)
message("Problematic chronograms were saved to ", problems_file)
} else {
message("There were no problematic chronograms.")
}
tot_time <- Sys.time() - start_time # end of registering function running time
class(trees) <- "multiPhylo"
result <- list(trees = trees,
authors = authors,
curators = curators,
studies = studies,
dois = dois,
update = Sys.time(),
version = utils::packageVersion("datelife"))
attr(result, "running_time") <- tot_time
message(tot_time)
return(result)
}

#' Create an updated OpenTree chronograms database object

Expand Down Expand Up @@ -134,17 +301,22 @@ phylo_has_brlen <- function(phy) {
#' \item{version}{A character vector indicating the datelife package version when the
#' object was last updated.}
#' }
#' @export
get_opentree_chronograms <- function(max_tree_count = "all") {
# #' @export
get_opentree_chronograms_slow <- function(max_tree_count = "all") {
options(warn = 1)
start_time <- Sys.time() # to register run time
chronogram_matches <- rotl::studies_find_trees(property = "ot:branchLengthTimeUnit",
value = "Myr",
detailed = TRUE,
verbose = FALSE)
# rotl::studies_find_trees throws this Warning in rbind(c(n_trees = "1", tree_ids = "tree6160", candidate = "tree6160", :
message()
chronogram_matches <- suppressWarnings(
rotl::studies_find_trees(property = "ot:branchLengthTimeUnit",
value = "Myr",
detailed = TRUE,
verbose = FALSE))
# rotl::studies_find_trees throws this Warning for different studies:
# Warning in rbind(c(n_trees = "1", tree_ids = "tree6160", candidate = "tree6160", : pg_2641
# tree6160 <- rotl::get_study_tree(study_id = "pg_2641", tree_id = "tree6160")
# number of columns of result is not a multiple of vector length (arg 1) colnames(chronogram_matches)
# rotl::list_trees(chronogram_matches)
# study_id = "ot_615", tree_id = "Tr76459"
trees <- list()
authors <- list()
curators <- list()
Expand Down Expand Up @@ -298,8 +470,8 @@ is_good_chronogram <- function(phy) {
warning("tree failed over not having more internal nodes than tips")
}
if (length(which(grepl("not.mapped", phy$tip.label))) > 0) {
warning("tree failed over having not mapped taxa that should have been checked")
passing <- FALSE # not cleaned properly
warning("tree has tip labels that are not mapped to the Open Tree Taxonomy")
passing <- TRUE # not cleaned properly
}
# enhance: test that there are no duplicated labels in chronogram:
if (any(is.na(phy$tip.label))) {
Expand Down Expand Up @@ -443,6 +615,11 @@ map_nodes_ott <- function(tree) {
# tree <- ss[[1]]
# phy <- opentree_chronograms$trees[[i]]
phy <- tree_check(tree = tree, dated = FALSE)
# check that at least some tip labels have OTT names:
if (all(is.na(tree$ott_ids))) {
warning("OTT nodes could not be mapped on tree.")
return(tree)
}
cc <- tryCatch(classification_paths_from_taxonomy(phy$tip.label, sources = "Open Tree of Life Reference Taxonomy"),
error = function(e) list(resolved = c())
) # this traps the error when phy = opentree_chronograms$trees[[50]] from load(data-raw/opentree_chronograms_oct2018.rda)
Expand Down
1 change: 1 addition & 0 deletions R/opentree_taxonomy_general.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ tnrs_match.default <- function(input, reference_taxonomy = "ott", ...) { # enhan
toupper(reference_taxonomy),
").")
input <- stringr::str_trim(input, side = "both") # cleans the input of lingering unneeded white spaces
input <- gsub("_", " ", input) # do tnrs with spaces and not underscore to avoid bug from rotl v3.0.14 (felis_margarita to Margarites_helicinus)
allinput <- input
input <- unique(input)
# enhance: infer taxonomic contexts:
Expand Down
Loading

0 comments on commit 2c848b2

Please sign in to comment.