Let’s inspect the results by doing normalization and then UMAP visualization.
And inspect the new results by UMAP.
-DefaultAssay(seurat_obj.combined) <- "integrated"
+DefaultAssay(seurat_obj.combined) <- "integrated"
# Run the standard workflow for visualization and clustering
seurat_obj.combined <- RunPCA(seurat_obj.combined, npcs = 30, verbose = FALSE)
seurat_obj.combined <- RunUMAP(seurat_obj.combined, reduction = "pca", dims = 1:30)
-#> 10:16:53 UMAP embedding parameters a = 0.9922 b = 1.112
-#> 10:16:53 Read 10153 rows and found 30 numeric columns
-#> 10:16:53 Using Annoy for neighbor search, n_neighbors = 30
-#> 10:16:53 Building Annoy index with metric = cosine, n_trees = 50
+#> 14:49:10 UMAP embedding parameters a = 0.9922 b = 1.112
+#> 14:49:10 Read 10153 rows and found 30 numeric columns
+#> 14:49:10 Using Annoy for neighbor search, n_neighbors = 30
+#> 14:49:10 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
-#> 10:16:55 Writing NN index file to temp file /tmp/RtmpzP59ex/file8ed3ae2ddbc
-#> 10:16:55 Searching Annoy index using 1 thread, search_k = 3000
-#> 10:16:58 Annoy recall = 100%
-#> 10:16:59 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
-#> 10:16:59 Initializing from normalized Laplacian + noise (using RSpectra)
-#> 10:17:00 Commencing optimization for 200 epochs, with 409718 positive edges
-#> 10:17:04 Optimization finished
+Plot the UMAP.
# By assay
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-12-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-12-1.png
index 21f4e8e35..eb3edd712 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-12-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-12-1.png differ
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-13-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-13-1.png
index a119ef287..6cd19d1c2 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-13-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-13-1.png differ
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-17-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-17-1.png
index af790e53b..f61613490 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-17-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-17-1.png differ
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-18-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-18-1.png
index 5e43455e5..c53282d06 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-18-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-18-1.png differ
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-22-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-22-1.png
index ef04f6b1f..71bf2f0e0 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-22-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-22-1.png differ
diff --git a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-23-1.png b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-23-1.png
index c6c933c91..73c81af1c 100644
Binary files a/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-23-1.png and b/r/articles/comp_bio_data_integration_files/figure-html/unnamed-chunk-23-1.png differ
diff --git a/r/articles/comp_bio_normalizing_full_gene_sequencing.html b/r/articles/comp_bio_normalizing_full_gene_sequencing.html
index 9339d254e..0d6c7ed39 100644
--- a/r/articles/comp_bio_normalizing_full_gene_sequencing.html
+++ b/r/articles/comp_bio_normalizing_full_gene_sequencing.html
@@ -61,6 +61,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -137,9 +138,11 @@ Fetching full-le
)
liver_dataset
#> soma_joinid collection_id collection_name
-#> 1 525 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis
+#> 1 583 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis
#> collection_doi dataset_id
#> 1 10.1038/s41586-020-2496-1 4546e757-34d0-4d17-be06-538318925fcd
+#> dataset_version_id
+#> 1 0a851e26-a629-4e59-9b52-9b4d1ce4440b
#> dataset_title
#> 1 Liver - A single-cell transcriptomic atlas characterizes ageing tissues in the mouse - Smart-seq2
#> dataset_h5ad_path dataset_total_cell_count
@@ -168,8 +171,8 @@ Normalizing expressio
liver_seurat
#> An object of class Seurat
-#> 52392 features across 2859 samples within 1 assay
-#> Active assay: RNA (52392 features, 0 variable features)
+#> 52417 features across 2859 samples within 1 assay
+#> Active assay: RNA (52417 features, 0 variable features)
#> 2 layers present: counts, data
Let’s get the genes measured in this dataset.
@@ -187,13 +190,13 @@ Normalizing expressio
We can see that, out of all genes in the Census, 17,992 were measured in this dataset.
Now let’s normalize these genes by gene length. We can easily do this because the Census has gene lengths included in the gene metadata under feature_length.
-GetAssayData(liver_seurat[1:5, 1:5], slot = "data")
+GetAssayData(liver_seurat[1:5, 1:5], slot = "data")
#> Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
#> ℹ Please use the `layer` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> 5 x 5 sparse Matrix of class "dgCMatrix"
-#> cell3530079 cell3530080 cell3530081 cell3530082 cell3530083
+#> cell3959639 cell3959640 cell3959641 cell3959642 cell3959643
#> ENSMUSG00000025900 . . . . .
#> ENSMUSG00000025902 . . . . 2250
#> ENSMUSG00000033845 . 559 1969 . .
@@ -201,14 +204,14 @@ Normalizing expressio
#> ENSMUSG00000033813 . . 828 1 54
gene_lengths <- liver_seurat$RNA@meta.features$feature_length
-liver_seurat <- SetAssayData(
+liver_seurat <- SetAssayData(
liver_seurat,
- new.data = sweep(GetAssayData(liver_seurat, slot = "data"), 1, gene_lengths, "/")
+ new.data = sweep(GetAssayData(liver_seurat, slot = "data"), 1, gene_lengths, "/")
)
-GetAssayData(liver_seurat[1:5, 1:5], slot = "data")
+GetAssayData(liver_seurat[1:5, 1:5], slot = "data")
#> 5 x 5 sparse Matrix of class "dgCMatrix"
-#> cell3530079 cell3530080 cell3530081 cell3530082 cell3530083
+#> cell3959639 cell3959640 cell3959641 cell3959642 cell3959643
#> ENSMUSG00000025900 . . . . .
#> ENSMUSG00000025902 . . . . 0.47150042
#> ENSMUSG00000033845 . 0.06586544 0.2320019 . .
@@ -222,7 +225,7 @@ Validation through clustering
Let’s perform some basic clustering analysis to see if cell types cluster as expected using the normalized counts.
First we do some basic filtering of cells and genes.
-cells_per_gene <- rowSums(GetAssayData(liver_seurat, slot = "counts") > 0)
+cells_per_gene <- rowSums(GetAssayData(liver_seurat, slot = "counts") > 0)
genes_per_cell <- Matrix::colSums(liver_seurat$RNA@counts > 0)
liver_seurat <- liver_seurat[cells_per_gene >= 5, genes_per_cell >= 500]
Then we normalize to account for sequencing depth and transform data to log scale.
@@ -247,7 +250,7 @@ Validation through clustering
liver_seurat <- RunPCA(
liver_seurat,
- features = VariableFeatures(object = liver_seurat)
+ features = VariableFeatures(object = liver_seurat)
)
liver_seurat <- FindNeighbors(liver_seurat, dims = 1:40)
liver_seurat <- RunUMAP(liver_seurat, dims = 1:40)
diff --git a/r/articles/comp_bio_summarize_axis_query.html b/r/articles/comp_bio_summarize_axis_query.html
index ad4bb24fd..e4b246118 100644
--- a/r/articles/comp_bio_summarize_axis_query.html
+++ b/r/articles/comp_bio_summarize_axis_query.html
@@ -61,6 +61,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
diff --git a/r/articles/index.html b/r/articles/index.html
index 8e340b2b5..440491e44 100644
--- a/r/articles/index.html
+++ b/r/articles/index.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -88,6 +89,8 @@ API
- Querying and fetching the single-cell data and cell/gene metadata
+ -
+
- Generating citations for Census slices
-
- Census Datasets example
-
diff --git a/r/authors.html b/r/authors.html
index 6f4321c9b..d29e646b7 100644
--- a/r/authors.html
+++ b/r/authors.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -76,14 +77,14 @@
Authors
Citation
Source: DESCRIPTION
- Chan Zuckerberg Initiative Foundation (2023).
+ Chan Zuckerberg Initiative Foundation (2024).
cellxgene.census: CZ CELLxGENE Discover Cell Census.
R package version 1.9.1, https://github.com/chanzuckerberg/cellxgene-census.
@Manual{,
title = {cellxgene.census: CZ CELLxGENE Discover Cell Census},
author = {{Chan Zuckerberg Initiative Foundation}},
- year = {2023},
+ year = {2024},
note = {R package version 1.9.1},
url = {https://github.com/chanzuckerberg/cellxgene-census},
}
diff --git a/r/index.html b/r/index.html
index d90e72f13..d7676cada 100644
--- a/r/index.html
+++ b/r/index.html
@@ -65,6 +65,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
diff --git a/r/pkgdown.yml b/r/pkgdown.yml
index 760d5821e..49945b046 100644
--- a/r/pkgdown.yml
+++ b/r/pkgdown.yml
@@ -3,6 +3,7 @@ pkgdown: 2.0.7
pkgdown_sha: ~
articles:
census_access_maintained_embeddings: census_access_maintained_embeddings.html
+ census_citation_generation: census_citation_generation.html
census_compute_over_X: census_compute_over_X.html
census_dataset_presence: census_dataset_presence.html
census_datasets: census_datasets.html
@@ -11,5 +12,5 @@ articles:
comp_bio_data_integration: comp_bio_data_integration.html
comp_bio_normalizing_full_gene_sequencing: comp_bio_normalizing_full_gene_sequencing.html
comp_bio_summarize_axis_query: comp_bio_summarize_axis_query.html
-last_built: 2023-12-18T17:31Z
+last_built: 2024-01-08T22:03Z
diff --git a/r/reference/download_source_h5ad.html b/r/reference/download_source_h5ad.html
index 2fb90fa8a..783a0833b 100644
--- a/r/reference/download_source_h5ad.html
+++ b/r/reference/download_source_h5ad.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
diff --git a/r/reference/get_census_version_description.html b/r/reference/get_census_version_description.html
index 8cc77f51c..d06f4fdd2 100644
--- a/r/reference/get_census_version_description.html
+++ b/r/reference/get_census_version_description.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -91,15 +92,15 @@ Value
Examples
as.data.frame(get_census_version_description("stable"))
#> release_date release_build
-#> 1 2023-07-25
+#> 1 2023-12-15
#> soma.uri
-#> 1 s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/soma/
+#> 1 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/
#> soma.relative_uri soma.s3_region
-#> 1 /cell-census/2023-07-25/soma/ us-west-2
+#> 1 /cell-census/2023-12-15/soma/ us-west-2
#> h5ads.uri
-#> 1 s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/h5ads/
+#> 1 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/h5ads/
#> h5ads.relative_uri h5ads.s3_region do_not_delete lts alias
-#> 1 /cell-census/2023-07-25/h5ads/ us-west-2 TRUE TRUE stable
+#> 1 /cell-census/2023-12-15/h5ads/ us-west-2 TRUE TRUE stable
#> census_version
#> 1 stable
diff --git a/r/reference/get_census_version_directory.html b/r/reference/get_census_version_directory.html
index c7bf2e322..72705876a 100644
--- a/r/reference/get_census_version_directory.html
+++ b/r/reference/get_census_version_directory.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -86,71 +87,77 @@ Value
Examples
get_census_version_directory()
#> release_date release_build
-#> stable 2023-07-25
-#> latest 2023-12-11
+#> stable 2023-12-15
+#> latest 2024-01-01
#> 2023-05-15 2023-05-15
#> 2023-07-25 2023-07-25
#> 2023-10-23 2023-10-23
-#> 2023-11-13 2023-11-13
-#> 2023-11-20 2023-11-20
-#> 2023-11-27 2023-11-27
#> 2023-12-11 2023-12-11
#> 2023-12-15 2023-12-15
+#> 2023-12-18 2023-12-18
+#> 2023-12-21 2023-12-21
+#> 2023-12-25 2023-12-25
+#> 2024-01-01 2024-01-01
#> soma.uri
-#> stable s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/soma/
-#> latest s3://cellxgene-census-public-us-west-2/cell-census/2023-12-11/soma/
+#> stable s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/
+#> latest s3://cellxgene-census-public-us-west-2/cell-census/2024-01-01/soma/
#> 2023-05-15 s3://cellxgene-census-public-us-west-2/cell-census/2023-05-15/soma/
#> 2023-07-25 s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/soma/
#> 2023-10-23 s3://cellxgene-census-public-us-west-2/cell-census/2023-10-23/soma/
-#> 2023-11-13 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-13/soma/
-#> 2023-11-20 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-20/soma/
-#> 2023-11-27 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-27/soma/
#> 2023-12-11 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-11/soma/
#> 2023-12-15 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/soma/
+#> 2023-12-18 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-18/soma/
+#> 2023-12-21 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-21/soma/
+#> 2023-12-25 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-25/soma/
+#> 2024-01-01 s3://cellxgene-census-public-us-west-2/cell-census/2024-01-01/soma/
#> soma.relative_uri soma.s3_region
-#> stable /cell-census/2023-07-25/soma/ us-west-2
-#> latest /cell-census/2023-12-11/soma/ us-west-2
+#> stable /cell-census/2023-12-15/soma/ us-west-2
+#> latest /cell-census/2024-01-01/soma/ us-west-2
#> 2023-05-15 /cell-census/2023-05-15/soma/ us-west-2
#> 2023-07-25 /cell-census/2023-07-25/soma/ us-west-2
#> 2023-10-23 /cell-census/2023-10-23/soma/ us-west-2
-#> 2023-11-13 /cell-census/2023-11-13/soma/ us-west-2
-#> 2023-11-20 /cell-census/2023-11-20/soma/ us-west-2
-#> 2023-11-27 /cell-census/2023-11-27/soma/ us-west-2
#> 2023-12-11 /cell-census/2023-12-11/soma/ us-west-2
#> 2023-12-15 /cell-census/2023-12-15/soma/ us-west-2
+#> 2023-12-18 /cell-census/2023-12-18/soma/ us-west-2
+#> 2023-12-21 /cell-census/2023-12-21/soma/ us-west-2
+#> 2023-12-25 /cell-census/2023-12-25/soma/ us-west-2
+#> 2024-01-01 /cell-census/2024-01-01/soma/ us-west-2
#> h5ads.uri
-#> stable s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/h5ads/
-#> latest s3://cellxgene-census-public-us-west-2/cell-census/2023-12-11/h5ads/
+#> stable s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/h5ads/
+#> latest s3://cellxgene-census-public-us-west-2/cell-census/2024-01-01/h5ads/
#> 2023-05-15 s3://cellxgene-census-public-us-west-2/cell-census/2023-05-15/h5ads/
#> 2023-07-25 s3://cellxgene-census-public-us-west-2/cell-census/2023-07-25/h5ads/
#> 2023-10-23 s3://cellxgene-census-public-us-west-2/cell-census/2023-10-23/h5ads/
-#> 2023-11-13 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-13/h5ads/
-#> 2023-11-20 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-20/h5ads/
-#> 2023-11-27 s3://cellxgene-census-public-us-west-2/cell-census/2023-11-27/h5ads/
#> 2023-12-11 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-11/h5ads/
#> 2023-12-15 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-15/h5ads/
+#> 2023-12-18 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-18/h5ads/
+#> 2023-12-21 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-21/h5ads/
+#> 2023-12-25 s3://cellxgene-census-public-us-west-2/cell-census/2023-12-25/h5ads/
+#> 2024-01-01 s3://cellxgene-census-public-us-west-2/cell-census/2024-01-01/h5ads/
#> h5ads.relative_uri h5ads.s3_region do_not_delete lts
-#> stable /cell-census/2023-07-25/h5ads/ us-west-2 TRUE TRUE
-#> latest /cell-census/2023-12-11/h5ads/ us-west-2 FALSE NA
+#> stable /cell-census/2023-12-15/h5ads/ us-west-2 TRUE TRUE
+#> latest /cell-census/2024-01-01/h5ads/ us-west-2 FALSE NA
#> 2023-05-15 /cell-census/2023-05-15/h5ads/ us-west-2 TRUE TRUE
#> 2023-07-25 /cell-census/2023-07-25/h5ads/ us-west-2 TRUE TRUE
-#> 2023-10-23 /cell-census/2023-10-23/h5ads/ us-west-2 TRUE FALSE
-#> 2023-11-13 /cell-census/2023-11-13/h5ads/ us-west-2 FALSE NA
-#> 2023-11-20 /cell-census/2023-11-20/h5ads/ us-west-2 FALSE NA
-#> 2023-11-27 /cell-census/2023-11-27/h5ads/ us-west-2 FALSE NA
+#> 2023-10-23 /cell-census/2023-10-23/h5ads/ us-west-2 FALSE FALSE
#> 2023-12-11 /cell-census/2023-12-11/h5ads/ us-west-2 FALSE NA
#> 2023-12-15 /cell-census/2023-12-15/h5ads/ us-west-2 TRUE TRUE
+#> 2023-12-18 /cell-census/2023-12-18/h5ads/ us-west-2 FALSE NA
+#> 2023-12-21 /cell-census/2023-12-21/h5ads/ us-west-2 FALSE NA
+#> 2023-12-25 /cell-census/2023-12-25/h5ads/ us-west-2 FALSE NA
+#> 2024-01-01 /cell-census/2024-01-01/h5ads/ us-west-2 FALSE NA
#> alias
#> stable stable
#> latest latest
#> 2023-05-15
#> 2023-07-25
#> 2023-10-23
-#> 2023-11-13
-#> 2023-11-20
-#> 2023-11-27
#> 2023-12-11
#> 2023-12-15
+#> 2023-12-18
+#> 2023-12-21
+#> 2023-12-25
+#> 2024-01-01
diff --git a/r/reference/open_soma.html b/r/reference/open_soma.html
index c91b8087b..1922ec33a 100644
--- a/r/reference/open_soma.html
+++ b/r/reference/open_soma.html
@@ -38,6 +38,7 @@
Querying and fetching the single-cell data and cell/gene metadata
+ Generating citations for Census slices
Census Datasets example
Genes measured in each cell (dataset presence matrix)
Computing on X using online (incremental) algorithms
@@ -114,16 +115,16 @@ Value
Examples
census <- open_soma()
-#> The stable Census release is currently 2023-07-25. Specify census_version = "2023-07-25" in future calls to open_soma() to ensure data consistency.
+#> The stable Census release is currently 2023-12-15. Specify census_version = "2023-12-15" in future calls to open_soma() to ensure data consistency.
as.data.frame(census$get("census_info")$get("summary")$read()$concat())
#> soma_joinid label value
-#> 1 0 census_schema_version 1.0.0
-#> 2 1 census_build_date 2023-07-25
-#> 3 2 dataset_schema_version 3.0.0
-#> 4 3 total_cell_count 61656118
-#> 5 4 unique_cell_count 37447773
-#> 6 5 number_donors_homo_sapiens 13035
-#> 7 6 number_donors_mus_musculus 1417
+#> 1 0 census_schema_version 1.2.0
+#> 2 1 census_build_date 2023-10-23
+#> 3 2 dataset_schema_version 3.1.0
+#> 4 3 total_cell_count 68683222
+#> 5 4 unique_cell_count 40356133
+#> 6 5 number_donors_homo_sapiens 15588
+#> 7 6 number_donors_mus_musculus 1990
census$close()
diff --git a/r/search.json b/r/search.json
index 33d6de3ed..df1e3514c 100644
--- a/r/search.json
+++ b/r/search.json
@@ -1 +1 @@
-[{"path":"/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2023 Chan Zuckerberg Initiative Foundation Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"/articles/census_access_maintained_embeddings.html","id":"open-census","dir":"Articles","previous_headings":"","what":"Open Census","title":"Access CELLxGENE collaboration embeddings (scVI, Geneformer)","text":"","code":"library(\"cellxgene.census\") census <- open_soma(census_version = \"2023-12-15\")"},{"path":"/articles/census_access_maintained_embeddings.html","id":"load-embeddings-as-seurat-reductions","dir":"Articles","previous_headings":"","what":"Load embeddings as Seurat reductions","title":"Access CELLxGENE collaboration embeddings (scVI, Geneformer)","text":"high-level cellxgene.census::get_seurat() function can query Census load embeddings dimensional reductions Seurat object. ask Seurat object expression data human cells tissue_general equal 'central nervous system', along scVI geneformer embeddings (obsm_layers). 
embeddings stored dimensional reductions seurat_obj, can take quick look scVI embeddings 2D scatter plot via UMAP, colored Census cell_type annotations.","code":"library(\"Seurat\") seurat_obj <- get_seurat( census, organism = \"homo_sapiens\", obs_value_filter = \"tissue_general == 'central nervous system'\", obs_column_names = c(\"cell_type\"), obsm_layers = c(\"scvi\",\"geneformer\") ) seurat_obj <- RunUMAP( seurat_obj, reduction = \"scvi\", dims=1:ncol(Embeddings(seurat_obj, \"scvi\")) ) DimPlot(seurat_obj, reduction = \"umap\", group.by = \"cell_type\") + theme(legend.text = element_text(size = 8))"},{"path":"/articles/census_access_maintained_embeddings.html","id":"load-embeddings-as-singlecellexperiment-reductions","dir":"Articles","previous_headings":"","what":"Load embeddings as SingleCellExperiment reductions","title":"Access CELLxGENE collaboration embeddings (scVI, Geneformer)","text":"Similarly, cellxgene.census::get_single_cell_experiment() can query Census store embeddings dimensionality reduction results Bioconductor SingleCellExperiment object. , can view UMAP Geneformer embeddings colored cell_type.","code":"library(\"SingleCellExperiment\") sce_obj <- get_single_cell_experiment( census, organism = \"homo_sapiens\", obs_value_filter = \"tissue_general == 'central nervous system'\", obs_column_names = c(\"cell_type\"), obsm_layers = c(\"scvi\",\"geneformer\") ) sce_obj <- scater::runUMAP(sce_obj, dimred = \"geneformer\") scater::plotReducedDim(sce_obj, dimred = \"UMAP\", colour_by = \"cell_type\")"},{"path":"/articles/census_access_maintained_embeddings.html","id":"load-embeddings-as-sparsematrix","dir":"Articles","previous_headings":"","what":"Load embeddings as sparseMatrix","title":"Access CELLxGENE collaboration embeddings (scVI, Geneformer)","text":"Lastly, can use SOMAExperimentAxisQuery lower-level access embeddings’ numerical data. can performant use cases don’t need features Seurat SingleCellExperiment. 
row embeddings sparseMatrix provides fine-tuned Geneformer model’s 512-dimensional embedding vector cell, cell soma_joinids row names. different arguments, SOMAExperimentAxisQuery$to_sparse_matrix() can also read scVI embeddings expression data. Still lower-level access available SOMAExperimentAxisQuery$read(), streams Arrow tables. methods SOMAExperimentAxisQuery can fetch metadata like cell_type: SOMAExperimentAxisQuery loads ask Census, unlike high-level get_seurat() get_single_cell_experiment() functions, eagerly populate objects based query.","code":"query <- census$get(\"census_data\")$get(\"homo_sapiens\")$axis_query( \"RNA\", obs_query = tiledbsoma::SOMAAxisQuery$new(value_filter = \"tissue == 'tongue'\") ) embeddings <- query$to_sparse_matrix(\"obsm\", \"geneformer\") str(embeddings) #> Formal class 'dgTMatrix' [package \"Matrix\"] with 6 slots #> ..@ i : int [1:190464] 0 0 0 0 0 0 0 0 0 0 ... #> ..@ j : int [1:190464] 0 1 2 3 4 5 6 7 8 9 ... #> ..@ Dim : int [1:2] 372 512 #> ..@ Dimnames:List of 2 #> .. ..$ : chr [1:372] \"51784858\" \"51784859\" \"51784860\" \"51784861\" ... #> .. ..$ : chr [1:512] \"0\" \"1\" \"2\" \"3\" ... #> ..@ x : num [1:190464] 0.1104 -1.2031 1.0078 0.0131 1.2422 ... #> ..@ factors : list() head(as.data.frame(query$obs(column_names = c(\"soma_joinid\",\"cell_type\"))$concat())) #> soma_joinid cell_type #> 1 51784858 basal cell #> 2 51784859 basal cell #> 3 51784860 fibroblast #> 4 51784861 fibroblast #> 5 51784862 basal cell #> 6 51784863 basal cell census$close()"},{"path":"/articles/census_compute_over_X.html","id":"incremental-mean-calculation","dir":"Articles","previous_headings":"","what":"Incremental mean calculation","title":"Computing on X using online (incremental) algorithms","text":"Many statistics, marginal means, easy calculate incrementally. Let’s begin query X$raw sparse matrix unnormalized read counts, return results shards incrementally accumulate read count gene, divide cell count get mean reads per cell gene. 
First define query - case slice obs axis cells specific tissue & sex value, genes var axis. query$X() method returns iterator results, Arrow Table. table contain sparse X data obs/var coordinates, using standard SOMA names: soma_data - X values (float32) soma_dim_0 - obs coordinate (int64) soma_dim_1 - var coordinate (int64) Important: X matrices joined var/obs axis DataFrames integer join “id” (aka soma_joinid). positionally indexed, given cell gene may soma_joinid value (e.g., large integer). words, given X value, soma_dim_0 corresponds soma_joinid obs dataframe, soma_dim_1 coordinate corresponds soma_joinid var dataframe. convenience, query class includes utility simplify operations query slices. query$indexer indexer used wrap output query$X(), converting soma_joinids positional indexing query results. Positions [0, N), N number results query given axis. Key points: expensive query read results - rather make multiple passes data, read perform multiple computations. default, data census indexed soma_joinid positionally.","code":"library(\"tiledbsoma\") library(\"cellxgene.census\") census <- open_soma() query <- census$get(\"census_data\")$get(\"mus_musculus\")$axis_query( measurement_name = \"RNA\", obs_query = SOMAAxisQuery$new(value_filter = \"tissue=='brain' && sex=='male'\") ) genes_df <- query$var(column_names = c(\"feature_id\", \"feature_name\"))$concat() genes_df <- as.data.frame(genes_df) n_genes <- nrow(genes_df) # accumulator vector (for each gene) for the total count over all cells in X(\"raw\") raw_sum_by_gene <- numeric(n_genes) names(raw_sum_by_gene) <- genes_df$feature_id # iterate through in-memory shards of query results tables <- query$X(\"raw\")$tables() while (!tables$read_complete()) { table_part <- tables$read_next() # table_part is an Arrow table with the columns mentioned above. The result # order is not guaranteed! # table_part$soma_dim_1 is the var/gene soma_joinid. 
But note that these are # arbitrary int64 id's, and moreover each table_part may exhibit only a subset # of the values we'll see over all query results. query$indexer helps us map # any given soma_dim_1 values onto positions in query$var() (genes_df), that is # the union of all values we'll see. gene_indexes <- query$indexer$by_var(table_part$soma_dim_1)$as_vector() stopifnot(sum(gene_indexes >= n_genes) == 0) # sum(table_part) group by gene, yielding a numeric vector with the gene_index # in its names sum_part <- tapply(as.vector(table_part$soma_data), gene_indexes, sum) # update the accumulator vector which_genes <- as.integer(names(sum_part)) + 1 # nb: gene_indexes is zero-based stopifnot(sum(which_genes > n_genes) == 0) raw_sum_by_gene[which_genes] <- raw_sum_by_gene[which_genes] + sum_part } # Divide each sum by cell count to get mean reads per cell (for each gene), # implicitly averaging in all zero entries in X even though they weren't included # in the sparse query results. genes_df$raw_mean <- raw_sum_by_gene / query$n_obs genes_df #> feature_id feature_name raw_mean #> 1 ENSMUSG00000051951 Xkr4 1.283861e+00 #> 2 ENSMUSG00000089699 Gm1992 0.000000e+00 #> 3 ENSMUSG00000102343 Gm37381 0.000000e+00 #> 4 ENSMUSG00000025900 Rp1 2.914160e-01 #> 5 ENSMUSG00000025902 Sox17 6.074154e+01 #> 6 ENSMUSG00000104328 Gm37323 5.701742e-05 #> 7 ENSMUSG00000033845 Mrpl15 3.621738e+01 #> 8 ENSMUSG00000025903 Lypla1 1.827366e+01 #> 9 ENSMUSG00000104217 Gm37988 0.000000e+00 #> 10 ENSMUSG00000033813 Tcea1 3.960339e+01 #> 11 ENSMUSG00000002459 Rgs20 3.212989e+00 #> 12 ENSMUSG00000085623 Gm16041 5.701742e-05 #> 13 ENSMUSG00000033793 Atp6v1h 6.866793e+01 #> 14 ENSMUSG00000025905 Oprk1 4.198763e-01 #> 15 ENSMUSG00000033774 Npbwr1 1.140348e-04 #> 16 ENSMUSG00000025907 Rb1cc1 3.342227e+01 #> 17 ENSMUSG00000090031 4732440D04Rik 1.317102e-02 #> 18 ENSMUSG00000087247 Alkal1 5.701742e-05 #> 19 ENSMUSG00000033740 St18 1.525501e+01 #> 20 ENSMUSG00000051285 Pcmtd1 4.981224e+01 #> 21 
ENSMUSG00000097797 Gm26901 4.276306e-04 #> 22 ENSMUSG00000103067 Gm30414 0.000000e+00 #> 23 ENSMUSG00000025909 Sntg1 1.083131e+00 #> 24 ENSMUSG00000061024 Rrs1 1.929504e+01 #> 25 ENSMUSG00000025911 Adhfe1 1.163515e+01 #> 26 ENSMUSG00000067879 Vxn 9.911623e+00 #> 27 ENSMUSG00000099827 Gm29520 0.000000e+00 #> 28 ENSMUSG00000025912 Mybl1 2.439775e-01 #> 29 ENSMUSG00000045210 Vcpip1 3.177732e+01 #> 30 ENSMUSG00000097893 1700034P13Rik 5.257291e-01 #> 31 ENSMUSG00000025915 Sgk3 1.850455e+01 #> 32 ENSMUSG00000046101 Mcmdc2 6.555578e-01 #> 33 ENSMUSG00000098234 Snhg6 6.245488e+00 #> [ reached 'max' / getOption(\"max.print\") -- omitted 52359 rows ]"},{"path":"/articles/census_compute_over_X.html","id":"counting-cells-grouped-by-dataset-and-gene","dir":"Articles","previous_headings":"","what":"Counting cells grouped by dataset and gene","title":"Computing on X using online (incremental) algorithms","text":"goal example count number cells nonzero reads, grouped gene Census dataset_id. result data frame dataset, gene, number cells nonzero reads dataset gene. multi-factor aggregation, ’ll take advantage dplyr routines instead lower-level vector indexer shown . presentation purposes, ’ll limit query four genes, can expanded genes easily. 
Don’t forget close census.","code":"library(\"dplyr\") query <- census$get(\"census_data\")$get(\"mus_musculus\")$axis_query( measurement_name = \"RNA\", obs_query = SOMAAxisQuery$new(value_filter = \"tissue=='brain'\"), var_query = SOMAAxisQuery$new(value_filter = \"feature_name %in% c('Malat1', 'Ptprd', 'Dlg2', 'Pcdh9')\") ) obs_tbl <- query$obs(column_names=c(\"soma_joinid\", \"dataset_id\"))$concat() obs_df <- data.frame( # materialize soma_joinid as character to avoid overflowing R 32-bit integer cell_id = as.character(obs_tbl$soma_joinid), dataset_id = obs_tbl$dataset_id$as_vector() ) var_tbl <- query$var(column_names=c(\"soma_joinid\", \"feature_name\"))$concat() var_df <- data.frame( gene_id = as.character(var_tbl$soma_joinid), feature_name = var_tbl$feature_name$as_vector() ) # accumulator for # cells by dataset & gene n_cells_grouped <- data.frame( \"dataset_id\" = character(0), \"gene_id\" = character(0), \"n_cells\" = numeric(0) ) # iterate through in-memory shards of query results tables <- query$X(\"raw\")$tables() while (!tables$read_complete()) { table_part <- tables$read_next() # prepare a (dataset,gene,1) tuple for each entry in table_part n_cells_part <- data.frame( \"cell_id\" = as.character(table_part$soma_dim_0), \"gene_id\" = as.character(table_part$soma_dim_1), \"n_cells\" = 1 ) n_cells_part <- left_join(n_cells_part, obs_df, by = \"cell_id\") stopifnot(sum(is.null(n_cells_part$dataset_id)) == 0) # fold those into n_cells_grouped n_cells_grouped <- n_cells_part %>% select(-cell_id) %>% bind_rows(n_cells_grouped) %>% group_by(dataset_id, gene_id) %>% summarise(n_cells = sum(n_cells)) %>% ungroup() } # add gene names for display n_cells_grouped <- left_join(n_cells_grouped, var_df, by = \"gene_id\") stopifnot(sum(is.null(n_cells_grouped$feature_name)) == 0) n_cells_grouped[c(\"dataset_id\", \"feature_name\", \"n_cells\")] #> # A tibble: 17 × 3 #> dataset_id feature_name n_cells #>