Skip to content

Commit

Permalink
Switch to alserglab.wustl.edu/hsds, support for ARCHS4 v2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
assaron committed Mar 27, 2024
1 parent c888757 commit 9eeb7d8
Show file tree
Hide file tree
Showing 13 changed files with 108 additions and 95 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
Package: phantasusLite
Type: Package
Title: Loading and annotation RNA-Seq counts matrices
Version: 1.1.0
Title: Loading and annotating RNA-seq count matrices
Version: 1.1.1
Authors@R: c(person("Rita", "Sablina", role = "aut"),
person("Maxim", "Kleverov", role = "aut"),
person("Alexey", "Sergushichev", email = "[email protected]", role = c("aut", "cre")))
Description: PhantasusLite – a lightweight package with helper functions of general interest
extracted from phantasus package. In particular, it simplifies working with public
RNA-seq datasets from GEO by providing access to the remote
HSDS repository with the precomputed gene counts from ARCHS4 and DEE2 projects.
Depends: R (>= 4.3)
Depends: R (>= 4.2)
Imports:
data.table,
rhdf5client(>= 1.21.5),
rhdf5client(>= 1.25.1),
httr, stringr,
stats,
utils,
Expand All @@ -22,7 +22,7 @@ biocViews: GeneExpression, Transcriptomics, RNASeq
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Suggests:
testthat (>= 3.0.0),
knitr,
Expand Down
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Changes in version 1.2.0
* Switch to https://alserglab.wustl.edu/hsds remote for default HSDS server
* Depending on rhdf5client >= 1.25.1 to support ARCHS4 v2.3 files
96 changes: 48 additions & 48 deletions R/getHSDSFileList.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,48 @@
#' List the HDF5 files available on an HSDS server
#'
#' Queries the HSDS server for the domains stored below the root domain given
#' in `url` and collects their names. Inside every listed directory the domain
#' named `<root>/<directory>/<directory>.h5` is left out of the result.
#'
#' @param url URL of the HSDS server with the root domain passed in the
#'   `domain` query parameter.
#' @param directory Name of the directory (collection) below the root domain
#'   to list. If `NULL` (default), every entry of the root domain that does
#'   not end in `.h5` is treated as a directory and listed.
#'
#' @return Vector with the names of all HDF5-files on the server or of all
#'   files of the requested collection; `NULL` if nothing was found.
#'
#' @export
#' @import rhdf5client
#' @examples
#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
#' getHSDSFileList(url)
getHSDSFileList <- function(url='https://ctlab.itmo.ru/hsds/?domain=/counts',
                            directory = NULL) {
    parsedUrl <- httr::parse_url(url)
    dir <- parsedUrl$query$domain
    src <- HSDSSource(paste0(parsedUrl$scheme, '://', parsedUrl$hostname, '/',
                             parsedUrl$path))

    # Names of the domains found in one directory, without the domain named
    # "<dir>/<collection>/<collection>.h5"
    # NOTE(review): presumably that domain is the collection's metadata file
    # rather than a counts file -- confirm.
    listCollectionFiles <- function(collection) {
        request <- paste0(src@endpoint, "/domains?domain=",
                          dir, '/', collection)
        response <- rhdf5client:::submitRequest(request)
        fileNames <- unlist(lapply(response[["domains"]],
                                   function(domain) domain$name))
        excluded <- paste0(dir, "/", collection, '/', collection, ".h5")
        fileNames[fileNames != excluded]
    }

    if (is.null(directory)) {
        directories <- listDomains(src, dir)
        # Logical filter instead of `x[-grep(...)]`: with no ".h5" entry at the
        # top level grep() returns integer(0) and `x[-integer(0)]` is empty,
        # which silently dropped every directory.
        directories <- directories[!grepl("\\.h5$", directories)]
        # Strip the root domain as a literal string, first occurrence only
        directories <- sub(paste0(dir, '/'), '', directories, fixed = TRUE)
    } else {
        directories <- directory
    }

    hdf5FileList <- unlist(lapply(directories, listCollectionFiles))
    return(hdf5FileList)
}
#' List the HDF5 files available on an HSDS server
#'
#' Queries the HSDS server for the domains stored below the root domain given
#' in `url` and collects their names. Inside every listed directory the domain
#' named `<root>/<directory>/<directory>.h5` is left out of the result.
#'
#' @param url URL of the HSDS server with the root domain passed in the
#'   `domain` query parameter.
#' @param directory Name of the directory (collection) below the root domain
#'   to list. If `NULL` (default), every entry of the root domain that does
#'   not end in `.h5` is treated as a directory and listed.
#'
#' @return Vector with the names of all HDF5-files on the server or of all
#'   files of the requested collection; `NULL` if nothing was found.
#'
#' @export
#' @import rhdf5client
#' @examples
#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
#' getHSDSFileList(url)
getHSDSFileList <- function(url='https://alserglab.wustl.edu/hsds/?domain=/counts',
                            directory = NULL) {
    parsedUrl <- httr::parse_url(url)
    dir <- parsedUrl$query$domain
    src <- HSDSSource(paste0(parsedUrl$scheme, '://', parsedUrl$hostname, '/',
                             parsedUrl$path))

    # Names of the domains found in one directory, without the domain named
    # "<dir>/<collection>/<collection>.h5"
    # NOTE(review): presumably that domain is the collection's metadata file
    # rather than a counts file -- confirm.
    listCollectionFiles <- function(collection) {
        request <- paste0(src@endpoint, "/domains?domain=",
                          dir, '/', collection)
        response <- rhdf5client:::submitRequest(request)
        fileNames <- unlist(lapply(response[["domains"]],
                                   function(domain) domain$name))
        excluded <- paste0(dir, "/", collection, '/', collection, ".h5")
        fileNames[fileNames != excluded]
    }

    if (is.null(directory)) {
        directories <- listDomains(src, dir)
        # Logical filter instead of `x[-grep(...)]`: with no ".h5" entry at the
        # top level grep() returns integer(0) and `x[-integer(0)]` is empty,
        # which silently dropped every directory.
        directories <- directories[!grepl("\\.h5$", directories)]
        # Strip the root domain as a literal string, first occurrence only
        directories <- sub(paste0(dir, '/'), '', directories, fixed = TRUE)
    } else {
        directories <- directory
    }

    hdf5FileList <- unlist(lapply(directories, listCollectionFiles))
    return(hdf5FileList)
}
8 changes: 4 additions & 4 deletions R/loadCountsFromH5file.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ getSamples <- function(h5f, samples_id) {
#' @examples
#' ess <- GEOquery::getGEO("GSE85653")
#' es <- ess[[1]]
#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
#' file <- "/dee2/athaliana_star_matrix_20221107.h5"
#' es <- loadCountsFromH5FileHSDS(es, url, file)
loadCountsFromH5FileHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain=/counts', file, sampleIndexes = NULL) {
loadCountsFromH5FileHSDS <- function(es, url='https://alserglab.wustl.edu/hsds/?domain=/counts', file, sampleIndexes = NULL) {
if (nrow(es) > 0) {
return(es)
}
Expand Down Expand Up @@ -129,10 +129,10 @@ loadCountsFromH5FileHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain
#' @examples
#' ess <- GEOquery::getGEO("GSE85653")
#' es <- ess[[1]]
#' url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
#' url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
#' es <- loadCountsFromHSDS(es, url)
#'
loadCountsFromHSDS <- function(es, url='https://ctlab.itmo.ru/hsds/?domain=/counts') {
loadCountsFromHSDS <- function(es, url='https://alserglab.wustl.edu/hsds/?domain=/counts') {
if (nrow(es) > 0) {
return(es)
}
Expand Down
3 changes: 2 additions & 1 deletion R/updateAndCreateMetaLocal.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ createMetaH5 <- function(counts_dir){
message("Skipping ", h5filename, " as it's already exists")
next
}
message("Creating ", h5filename)
createH5(h5_meta, h5filename, 'meta')
}
return(invisible(NULL))
Expand Down Expand Up @@ -193,7 +194,7 @@ createIndexH5 <- function(data, file) {
for (i in seq_along(names)) {
rhdf5::h5write(data[[i]], file, paste0("/",names[i]))
}
h5closeAll()
rhdf5::h5closeAll()
return(invisible(NULL))
}

Expand Down
2 changes: 1 addition & 1 deletion R/updateAndCreateMetaRemote.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ createIndexH5Remote <- function(url,
collections=c('archs4', 'dee2'),
destfile="index.h5") {
if (file.exists(destfile)) {
stop("File ", destfile, " alsready exists")
stop("File ", destfile, " already exists")
}

DT_h5_meta <- getIndexRemote(url, collections)
Expand Down
46 changes: 23 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,39 +67,39 @@ Function loadCountsFromHSDS returns an ExpressionSet with the expression
matrix – the second exprs(es) contains an expression matrix.

The remote repository URL is
<https://ctlab.itmo.ru/hsds/?domain=/counts>’.
<https://alserglab.wustl.edu/hsds/?domain=/counts>.

``` r
url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
es <- loadCountsFromHSDS(es, url)
head(exprs(es))
```

## GSM1281300 GSM1281301 GSM1281302 GSM1281303 GSM1281304 GSM1281305
## 0610007P14Rik 86 67 30 46 23 61
## 0610009B22Rik 29 22 3 0 33 13
## 0610009L18Rik 0 0 7 0 0 15
## 0610009O20Rik 103 38 17 20 31 54
## 0610010F05Rik 259 91 115 88 113 185
## 0610010K14Rik 17 6 0 0 1 0
## GSM1281306 GSM1281307
## 0610007P14Rik 105 22
## 0610009B22Rik 15 26
## 0610009L18Rik 0 9
## 0610009O20Rik 24 29
## 0610010F05Rik 108 163
## 0610010K14Rik 0 7
## GSM1281300 GSM1281301 GSM1281302 GSM1281303 GSM1281304
## ENSMUSG00000000001 1015 603 561 549 425
## ENSMUSG00000000003 0 0 0 0 0
## ENSMUSG00000000028 109 34 0 14 9
## ENSMUSG00000000031 0 18 0 0 0
## ENSMUSG00000000037 0 0 0 0 0
## ENSMUSG00000000049 0 0 0 0 0
## GSM1281305 GSM1281306 GSM1281307
## ENSMUSG00000000001 853 407 479
## ENSMUSG00000000003 0 0 0
## ENSMUSG00000000028 165 0 15
## ENSMUSG00000000031 0 0 0
## ENSMUSG00000000037 0 0 0
## ENSMUSG00000000049 0 0 0

The available gene annotations are also filled in:

``` r
head(fData(es))
```

## ENSEMBLID Gene Symbol
## 0610007P14Rik missing 0610007P14Rik
## 0610009B22Rik ENSMUSG00000007777 0610009B22Rik
## 0610009L18Rik ENSMUSG00000043644 0610009L18Rik
## 0610009O20Rik missing 0610009O20Rik
## 0610010F05Rik ENSMUSG00000042208 0610010F05Rik
## 0610010K14Rik ENSMUSG00000020831 0610010K14Rik
## Gene symbol ENSEMBLID
## ENSMUSG00000000001 Gnai3 ENSMUSG00000000001
## ENSMUSG00000000003 Pbsn ENSMUSG00000000003
## ENSMUSG00000000028 Cdc45 ENSMUSG00000000028
## ENSMUSG00000000031 H19 ENSMUSG00000000031
## ENSMUSG00000000037 Scml2 ENSMUSG00000000037
## ENSMUSG00000000049 Apoh ENSMUSG00000000049
4 changes: 2 additions & 2 deletions README.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ head(exprs(es))

The function `loadCountsFromHSDS` returns an ExpressionSet with the expression matrix filled in -- the second call to `exprs(es)` now shows an expression matrix.

The remote repository URL is '<https://ctlab.itmo.ru/hsds/?domain=/counts>'.
The remote repository URL is '<https://alserglab.wustl.edu/hsds/?domain=/counts>'.

```{r}
url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
es <- loadCountsFromHSDS(es, url)
head(exprs(es))
```
Expand Down
4 changes: 2 additions & 2 deletions man/getHSDSFileList.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/loadCountsFromH5FileHSDS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions man/loadCountsFromHSDS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions tests/testthat/test-loadCountsFromH5file.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
library(GEOquery)

test_that("loadCountsFromHSDS works correctly", {
url <- "https://ctlab.itmo.ru/hsds/?domain=/counts"
url <- "https://alserglab.wustl.edu/hsds/?domain=/counts"
ess <- getGEO("GSE85653", AnnotGPL = TRUE)
es <- ess[[1]]
es <- loadCountsFromHSDS(es, url)
Expand All @@ -21,7 +21,7 @@ test_that("loadCountsFromHSDS works correctly", {


test_that("loadCountsFromHSDS returns the same ExpressionSet, if it contains counts matrix", {
url <- "https://ctlab.itmo.ru/hsds/?domain=/counts"
url <- "https://alserglab.wustl.edu/hsds/?domain=/counts"
ess <- getGEO("GSE10010")
es1 <- ess[[1]]
es2 <- loadCountsFromHSDS(es1, url)
Expand All @@ -31,7 +31,7 @@ test_that("loadCountsFromHSDS returns the same ExpressionSet, if it contains cou


test_that("loadCountsFromH5FileHSDS works without metadata params", {
url <- "https://ctlab.itmo.ru/hsds/?domain=/counts"
url <- "https://alserglab.wustl.edu/hsds/?domain=/counts"
file <- 'archs4/Arabidopsis_thaliana_count_matrix.h5'
ess <- getGEO("GSE85653", AnnotGPL = TRUE)
es <- ess[[1]]
Expand Down
10 changes: 8 additions & 2 deletions vignettes/phantasusLite-tutorial.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ RNA-seq dataset from GEO do not contain the expression matrix, thus `exprs(es)`
head(exprs(es))
```

However, a number of precomputed gene count tables are available at HSDS server '<https://ctlab.itmo.ru/hsds/>'. It features HDF5 files with counts
However, a number of precomputed gene count tables are available at the HSDS server '<https://alserglab.wustl.edu/hsds/>'. It features HDF5 files with counts
from the ARCHS4 and DEE2 projects:

```{r}
url <- 'https://ctlab.itmo.ru/hsds/?domain=/counts'
url <- 'https://alserglab.wustl.edu/hsds/?domain=/counts'
getHSDSFileList(url)
```

Expand Down Expand Up @@ -92,6 +92,12 @@ The counts are different from the previous values as ARCHS4 counts were used --
preproc(experimentData(es))$gene_counts_source
```

Further, gene symbols are also imported from the ARCHS4 database and are available as feature data:
```{r}
head(fData(es))
```


# Inferring sample groups

For some of the GEO datasets, such as GSE53053, the sample annotation is not fully available.
Expand Down

0 comments on commit 9eeb7d8

Please sign in to comment.