From 9eeb7d8d276c416448ebfae7444c13533129d8bf Mon Sep 17 00:00:00 2001 From: Alexey Sergushichev Date: Wed, 27 Mar 2024 14:54:55 -0500 Subject: [PATCH] Switch to alserglab.wustl.edu/hsds, support for ARCHS4 v2.3 --- DESCRIPTION | 10 +-- NEWS | 3 + R/getHSDSFileList.R | 96 +++++++++++----------- R/loadCountsFromH5file.R | 8 +- R/updateAndCreateMetaLocal.R | 3 +- R/updateAndCreateMetaRemote.R | 2 +- README.md | 46 +++++------ README.rmd | 4 +- man/getHSDSFileList.Rd | 4 +- man/loadCountsFromH5FileHSDS.Rd | 4 +- man/loadCountsFromHSDS.Rd | 7 +- tests/testthat/test-loadCountsFromH5file.R | 6 +- vignettes/phantasusLite-tutorial.Rmd | 10 ++- 13 files changed, 108 insertions(+), 95 deletions(-) create mode 100644 NEWS diff --git a/DESCRIPTION b/DESCRIPTION index 0c193a9..c7cafb0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: phantasusLite Type: Package -Title: Loading and annotation RNA-Seq counts matrices -Version: 1.1.0 +Title: Loading and annotation RNA-seq counts matrices +Version: 1.1.1 Authors@R: c(person("Rita", "Sablina", role = "aut"), person("Maxim", "Kleverov", role = "aut"), person("Alexey", "Sergushichev", email = "alsergbox@gmail.com", role = c("aut", "cre"))) @@ -9,10 +9,10 @@ Description: PhantasusLite – a lightweight package with helper functions of ge extracted from phantasus package. In parituclar it simplifies working with public RNA-seq datasets from GEO by providing access to the remote HSDS repository with the precomputed gene counts from ARCHS4 and DEE2 projects. 
-Depends: R (>= 4.3) +Depends: R (>= 4.2) Imports: data.table, - rhdf5client(>= 1.21.5), + rhdf5client(>= 1.25.1), httr, stringr, stats, utils, @@ -22,7 +22,7 @@ biocViews: GeneExpression, Transcriptomics, RNASeq License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Suggests: testthat (>= 3.0.0), knitr, diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..736c62c --- /dev/null +++ b/NEWS @@ -0,0 +1,3 @@ +Changes in version 1.2.0 +* Switch to https://alserglab.wustl.edu/hsds remote for default HSDS server +* Depending on rhdf5client >= 1.25.1 to support ARCHS4 v2.3 files \ No newline at end of file diff --git a/R/getHSDSFileList.R b/R/getHSDSFileList.R index 9f7bae4..03011a1 100755 --- a/R/getHSDSFileList.R +++ b/R/getHSDSFileList.R @@ -1,48 +1,48 @@ -#' Returns list of all HDF5-files on HSDS-server -#' @param url, containing url of the server and root domain. -#' @param directory, containing name of the directory -#' -#' @return List of all HDF5-files on the server or all files of the collection -#' -#' @export -#' @import rhdf5client -#' @examples -#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' -#' getHSDSFileList(url) -#' - -getHSDSFileList <- function(url='https://ctlab.itmo.ru/hsds/?domain=/counts', directory = NULL) { - src <- httr::parse_url(url) - dir <- src$query$domain - src <- paste0(src$scheme,'://',src$hostname,'/',src$path) - src <- HSDSSource(src) - hdf5FileList <- list() - if (is.null(directory)) { - directories <- listDomains(src, dir) - directories <- directories[-grep("*\\.h5$", directories)] - directories <- gsub(paste0(dir, '/'), '', directories) - for (directory in directories) { - request <- paste0(src@endpoint, "/domains?domain=", - dir, '/', directory) - response <- rhdf5client:::submitRequest(request) - domains <- response[["domains"]] - for (domain in domains) { - if (domain$name != paste0(dir, "/", directory, '/', directory, ".h5")) { - hdf5FileList <- append(hdf5FileList, 
domain$name) - } - } - } - } else { - request <- paste0(src@endpoint, "/domains?domain=", - dir, '/', directory) - response <- rhdf5client:::submitRequest(request) - domains <- response[["domains"]] - for (domain in domains) { - if (domain$name != paste0(dir, "/", directory, '/', directory, ".h5")) { - hdf5FileList <- append(hdf5FileList, domain$name) - } - } - } - hdf5FileList <- unlist(hdf5FileList) - return(hdf5FileList) -} +#' Returns list of all HDF5-files on HSDS-server +#' @param url, containing url of the server and root domain. +#' @param directory, containing name of the directory +#' +#' @return List of all HDF5-files on the server or all files of the collection +#' +#' @export +#' @import rhdf5client +#' @examples +#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' +#' getHSDSFileList(url) +#' + +getHSDSFileList <- function(url='https://alserglab.wustl.edu/hsds/?domain=/counts', directory = NULL) { + src <- httr::parse_url(url) + dir <- src$query$domain + src <- paste0(src$scheme,'://',src$hostname,'/',src$path) + src <- HSDSSource(src) + hdf5FileList <- list() + if (is.null(directory)) { + directories <- listDomains(src, dir) + directories <- directories[-grep("*\\.h5$", directories)] + directories <- gsub(paste0(dir, '/'), '', directories) + for (directory in directories) { + request <- paste0(src@endpoint, "/domains?domain=", + dir, '/', directory) + response <- rhdf5client:::submitRequest(request) + domains <- response[["domains"]] + for (domain in domains) { + if (domain$name != paste0(dir, "/", directory, '/', directory, ".h5")) { + hdf5FileList <- append(hdf5FileList, domain$name) + } + } + } + } else { + request <- paste0(src@endpoint, "/domains?domain=", + dir, '/', directory) + response <- rhdf5client:::submitRequest(request) + domains <- response[["domains"]] + for (domain in domains) { + if (domain$name != paste0(dir, "/", directory, '/', directory, ".h5")) { + hdf5FileList <- append(hdf5FileList, domain$name) + } + } + } + 
hdf5FileList <- unlist(hdf5FileList) + return(hdf5FileList) +} diff --git a/R/loadCountsFromH5file.R b/R/loadCountsFromH5file.R index 968c945..d6de53b 100644 --- a/R/loadCountsFromH5file.R +++ b/R/loadCountsFromH5file.R @@ -35,10 +35,10 @@ getSamples <- function(h5f, samples_id) { #' @examples #' ess <- GEOquery::getGEO("GSE85653") #' es <- ess[[1]] -#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' #' file <- "/dee2/athaliana_star_matrix_20221107.h5" #' es <- loadCountsFromH5FileHSDS(es, url, file) -loadCountsFromH5FileHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain=/counts', file, sampleIndexes = NULL) { +loadCountsFromH5FileHSDS <- function(es, url='https://alserglab.wustl.edu/hsds/?domain=/counts', file, sampleIndexes = NULL) { if (nrow(es) > 0) { return(es) } @@ -129,10 +129,10 @@ loadCountsFromH5FileHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain #' @examples #' ess <- GEOquery::getGEO("GSE85653") #' es <- ess[[1]] -#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' #' es <- loadCountsFromHSDS(es, url) #' -loadCountsFromHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain=/counts') { +loadCountsFromHSDS <- function(es, url='https://alserglab.wustl.edu/hsds/?domain=/counts') { if (nrow(es) > 0) { return(es) } diff --git a/R/updateAndCreateMetaLocal.R b/R/updateAndCreateMetaLocal.R index f0f195d..34a9b23 100644 --- a/R/updateAndCreateMetaLocal.R +++ b/R/updateAndCreateMetaLocal.R @@ -35,6 +35,7 @@ createMetaH5 <- function(counts_dir){ message("Skipping ", h5filename, " as it's already exists") next } + message("Creating ", h5filename) createH5(h5_meta, h5filename, 'meta') } return(invisible(NULL)) @@ -193,7 +194,7 @@ createIndexH5 <- function(data, file) { for (i in seq_along(names)) { rhdf5::h5write(data[[i]], file, paste0("/",names[i])) } - h5closeAll() + rhdf5::h5closeAll() 
return(invisible(NULL)) } diff --git a/R/updateAndCreateMetaRemote.R b/R/updateAndCreateMetaRemote.R index 8750923..4bb968f 100644 --- a/R/updateAndCreateMetaRemote.R +++ b/R/updateAndCreateMetaRemote.R @@ -58,7 +58,7 @@ createIndexH5Remote <- function(url, collections=c('archs4', 'dee2'), destfile="index.h5") { if (file.exists(destfile)) { - stop("File ", destfile, " alsready exists") + stop("File ", destfile, " already exists") } DT_h5_meta <- getIndexRemote(url, collections) diff --git a/README.md b/README.md index 6a15d12..1db666d 100755 --- a/README.md +++ b/README.md @@ -67,28 +67,28 @@ Function loadCountsFromHSDS returns an ExpressionSet with the expression matrix – the second exprs(es) contains an expression matrix. The remote repository URL is -‘<https://ctlab.itmo.ru/hsds/?domain=/counts>’. +‘<https://alserglab.wustl.edu/hsds/?domain=/counts>’. ``` r -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' es <- loadCountsFromHSDS(es, url) head(exprs(es)) ``` - ## GSM1281300 GSM1281301 GSM1281302 GSM1281303 GSM1281304 GSM1281305 - ## 0610007P14Rik 86 67 30 46 23 61 - ## 0610009B22Rik 29 22 3 0 33 13 - ## 0610009L18Rik 0 0 7 0 0 15 - ## 0610009O20Rik 103 38 17 20 31 54 - ## 0610010F05Rik 259 91 115 88 113 185 - ## 0610010K14Rik 17 6 0 0 1 0 - ## GSM1281306 GSM1281307 - ## 0610007P14Rik 105 22 - ## 0610009B22Rik 15 26 - ## 0610009L18Rik 0 9 - ## 0610009O20Rik 24 29 - ## 0610010F05Rik 108 163 - ## 0610010K14Rik 0 7 + ## GSM1281300 GSM1281301 GSM1281302 GSM1281303 GSM1281304 + ## ENSMUSG00000000001 1015 603 561 549 425 + ## ENSMUSG00000000003 0 0 0 0 0 + ## ENSMUSG00000000028 109 34 0 14 9 + ## ENSMUSG00000000031 0 18 0 0 0 + ## ENSMUSG00000000037 0 0 0 0 0 + ## ENSMUSG00000000049 0 0 0 0 0 + ## GSM1281305 GSM1281306 GSM1281307 + ## ENSMUSG00000000001 853 407 479 + ## ENSMUSG00000000003 0 0 0 + ## ENSMUSG00000000028 165 0 15 + ## ENSMUSG00000000031 0 0 0 + ## ENSMUSG00000000037 0 0 0 + ## ENSMUSG00000000049 0 0 0 The available 
gene annotations are also filled in: head(fData(es)) ``` - ## ENSEMBLID Gene Symbol - ## 0610007P14Rik missing 0610007P14Rik - ## 0610009B22Rik ENSMUSG00000007777 0610009B22Rik - ## 0610009L18Rik ENSMUSG00000043644 0610009L18Rik - ## 0610009O20Rik missing 0610009O20Rik - ## 0610010F05Rik ENSMUSG00000042208 0610010F05Rik - ## 0610010K14Rik ENSMUSG00000020831 0610010K14Rik + ## Gene symbol ENSEMBLID + ## ENSMUSG00000000001 Gnai3 ENSMUSG00000000001 + ## ENSMUSG00000000003 Pbsn ENSMUSG00000000003 + ## ENSMUSG00000000028 Cdc45 ENSMUSG00000000028 + ## ENSMUSG00000000031 H19 ENSMUSG00000000031 + ## ENSMUSG00000000037 Scml2 ENSMUSG00000000037 + ## ENSMUSG00000000049 Apoh ENSMUSG00000000049 diff --git a/README.rmd b/README.rmd index 3d650bd..c13773c 100755 --- a/README.rmd +++ b/README.rmd @@ -63,10 +63,10 @@ head(exprs(es)) Function loadCountsFromHSDS returns an ExpressionSet with the expression matrix -- the second exprs(es) contains an expression matrix. -The remote repository URL is '<https://ctlab.itmo.ru/hsds/?domain=/counts>'. +The remote repository URL is '<https://alserglab.wustl.edu/hsds/?domain=/counts>'. 
```{r} -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' es <- loadCountsFromHSDS(es, url) head(exprs(es)) ``` diff --git a/man/getHSDSFileList.Rd b/man/getHSDSFileList.Rd index 185b57e..6636fa3 100644 --- a/man/getHSDSFileList.Rd +++ b/man/getHSDSFileList.Rd @@ -5,7 +5,7 @@ \title{Returns list of all HDF5-files on HSDS-server} \usage{ getHSDSFileList( - url = "https://ctlab.itmo.ru/hsds/?domain=/counts", + url = "https://alserglab.wustl.edu/hsds/?domain=/counts", directory = NULL ) } @@ -21,7 +21,7 @@ List of all HDF5-files on the server or all files of the collection Returns list of all HDF5-files on HSDS-server } \examples{ -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' getHSDSFileList(url) } diff --git a/man/loadCountsFromH5FileHSDS.Rd b/man/loadCountsFromH5FileHSDS.Rd index 617dab4..404c63b 100644 --- a/man/loadCountsFromH5FileHSDS.Rd +++ b/man/loadCountsFromH5FileHSDS.Rd @@ -6,7 +6,7 @@ \usage{ loadCountsFromH5FileHSDS( es, - url = "https://ctlab.itmo.ru/hsds/?domain=/counts", + url = "https://alserglab.wustl.edu/hsds/?domain=/counts", file, sampleIndexes = NULL ) @@ -29,7 +29,7 @@ Load count matrix from remote HDF5-file \examples{ ess <- GEOquery::getGEO("GSE85653") es <- ess[[1]] -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' file <- "/dee2/athaliana_star_matrix_20221107.h5" es <- loadCountsFromH5FileHSDS(es, url, file) } diff --git a/man/loadCountsFromHSDS.Rd b/man/loadCountsFromHSDS.Rd index 44734b6..f440c4c 100644 --- a/man/loadCountsFromHSDS.Rd +++ b/man/loadCountsFromHSDS.Rd @@ -4,7 +4,10 @@ \alias{loadCountsFromHSDS} \title{Load count matrix from HDF5-files.} \usage{ -loadCountsFromHSDS(es, url = "https://ctlab.itmo.ru/hsds/?domain=/counts") +loadCountsFromHSDS( + es, + url = "https://alserglab.wustl.edu/hsds/?domain=/counts" +) } \arguments{ \item{es, 
}{containing ExpressionSet loaded from GEO. Contains empty expression matrix.} @@ -20,7 +23,7 @@ Load count matrix from HDF5-files. \examples{ ess <- GEOquery::getGEO("GSE85653") es <- ess[[1]] -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' es <- loadCountsFromHSDS(es, url) } diff --git a/tests/testthat/test-loadCountsFromH5file.R b/tests/testthat/test-loadCountsFromH5file.R index 58f6514..dd5dcd7 100644 --- a/tests/testthat/test-loadCountsFromH5file.R +++ b/tests/testthat/test-loadCountsFromH5file.R @@ -1,7 +1,7 @@ library(GEOquery) test_that("loadCountsFromHSDS works correctly", { - url <- "https://ctlab.itmo.ru/hsds/?domain=/counts" + url <- "https://alserglab.wustl.edu/hsds/?domain=/counts" ess <- getGEO("GSE85653", AnnotGPL = TRUE) es <- ess[[1]] es <- loadCountsFromHSDS(es, url) @@ -21,7 +21,7 @@ test_that("loadCountsFromHSDS works correctly", { test_that("loadCountsFromHSDS returns the same ExpressionSet, if it contains counts matrix", { - url <- "https://ctlab.itmo.ru/hsds/?domain=/counts" + url <- "https://alserglab.wustl.edu/hsds/?domain=/counts" ess <- getGEO("GSE10010") es1 <- ess[[1]] es2 <- loadCountsFromHSDS(es1, url) @@ -31,7 +31,7 @@ test_that("loadCountsFromHSDS returns the same ExpressionSet, if it contains cou test_that("loadCountsFromH5FileHSDS works without metadata params", { - url <- "https://ctlab.itmo.ru/hsds/?domain=/counts" + url <- "https://alserglab.wustl.edu/hsds/?domain=/counts" file <- 'archs4/Arabidopsis_thaliana_count_matrix.h5' ess <- getGEO("GSE85653", AnnotGPL = TRUE) es <- ess[[1]] diff --git a/vignettes/phantasusLite-tutorial.Rmd b/vignettes/phantasusLite-tutorial.Rmd index 0943dd1..2eb031a 100755 --- a/vignettes/phantasusLite-tutorial.Rmd +++ b/vignettes/phantasusLite-tutorial.Rmd @@ -60,11 +60,11 @@ RNA-seq dataset from GEO do not contain the expression matrix, thus `exprs(es)` head(exprs(es)) ``` -However, a number of precomputed gene count tables are 
available at HSDS server '<https://ctlab.itmo.ru/hsds/?domain=/counts>'. It features HDF5 files with counts +However, a number of precomputed gene count tables are available at HSDS server '<https://alserglab.wustl.edu/hsds/?domain=/counts>'. It features HDF5 files with counts from ARCHS4 and DEE2 projects: ```{r} -url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts' +url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts' getHSDSFileList(url) ``` @@ -92,6 +92,12 @@ The counts are different from the previous values as ARCHS4 counts were used -- preproc(experimentData(es))$gene_counts_source ``` +Further, gene symbols are also imported from ARCHS4 database and are available as feature data: +```{r} +head(fData(es)) +``` + + # Inferring sample groups For some of the GEO datasets, such as GSE53053, the sample annotation is not fully available.