BioJulia · rasmushenningsson · Jun 20, 2024 · Jun 4, 2024 · Jun 4, 2024 · Jun 4, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,30 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.0] - 2024-06-20
+
+### Breaking
+
+* DataMatrix will now always use the first column of var/obs annotations as ID. (Multiple ID columns are no longer supported.)
+* `load_counts` - The default obs ID column name is now "cell_id" (was "id" before).
+* `load10x` - var: default to using only the first column (id) as unique identifier. Specify e.g. `var_id="var_id"=>["id", "feature_type"]` to merge multiple columns to create the ID.
+* `load10x` - obs: default to using the first column (barcode) as unique identifier.
+* `load10x` - no longer supports `copy_obs_col` kwarg.
+* `set_var_id_cols!` is replaced with `set_var_id_col!` (since there is only one ID column).
+* `set_obs_id_cols!` is replaced with `set_obs_id_col!` (since there is only one ID column).
+* Update to SCTransform 0.2, which handles `logcellcounts` better when there are multiple modalities (e.g. RNA and antibody counts) present in the data.
+
+### Added
+
+* `var_counts_fraction` - Just like `var_counts_fraction!`, but not modifying the object in place.
+* `var_counts_sum` and `var_counts_sum!` - For summing over selected variables. Useful for counting e.g. total RNA expression and finding number of expressed features.
+* Added support for using external annotations where applicable (filter, transforms, normalization, statistical tests, var_counts_fraction!, var_counts_sum!)
+* Added experimental (and thus not yet exported) `Annotations` struct, which wraps a `DataFrame` with IDs in the first column and ensures that the ID column is retained when accessing columns. (So that the resulting object can be leftjoined to `data.obs`/`data.var`.)
+
+### Fixed
+
 * Add compat for weakdeps (UMAP, TSne, PrincipalMomentAnalysis).
+* SVDModel now only stores `U` and `S` since `V` is not needed for projection.
 
 ## [0.3.9] - 2024-03-04
 

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SingleCellProjections"
 uuid = "03d38035-ed2f-4a36-82eb-797f1727ab2e"
 authors = ["Rasmus Henningsson <[email protected]>"]
-version = "0.3.9"
+version = "0.4.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -49,7 +49,7 @@ PrecompileTools = "1"
 PrincipalMomentAnalysis = "0.2"
 Random = "1"
 Requires = "1.2"
-SCTransform = "0.1"
+SCTransform = "0.2"
 SingleCell10x = "0.1, 0.2"
 SparseArrays = "1"
 StableRNGs = "1"

diff --git a/ext/SingleCellProjectionsPrincipalMomentAnalysisExt.jl b/ext/SingleCellProjectionsPrincipalMomentAnalysisExt.jl
@@ -87,7 +87,7 @@ See also: [`PrincipalMomentAnalysis.pma`](https://principalmomentanalysis.github
 """
 function PrincipalMomentAnalysis.pma(data::DataMatrix, args...; nsv=3, var=:copy, obs=:copy, kwargs...)
 	F = implicitpma(data.matrix, args...; nsv=nsv, kwargs...)
-	model = PMAModel(F,select(data.var,data.var_id_cols), var, obs)
+	model = PMAModel(F,select(data.var,1), var, obs)
 	update_matrix(data, F, model; model.var, model.obs)
 end
 

diff --git a/ext/SingleCellProjectionsUMAPExt.jl b/ext/SingleCellProjectionsUMAPExt.jl
@@ -28,7 +28,7 @@ The other `args...` and `kwargs...` are forwarded to `UMAP.umap`. See `UMAP` doc
 See also: [`UMAP.umap`](https://github.com/dillondaudert/UMAP.jl)
 """
 function UMAP.umap(data::DataMatrix, args...; obs=:copy, kwargs...)
-	model = UMAPModel(UMAP.UMAP_(obs_coordinates(data), args...; kwargs...), select(data.var,data.var_id_cols), obs)
+	model = UMAPModel(UMAP.UMAP_(obs_coordinates(data), args...; kwargs...), select(data.var,1), obs)
 	update_matrix(data, model.m.embedding, model; var="UMAP", model.obs)
 end
 

diff --git a/src/SingleCellProjections.jl b/src/SingleCellProjections.jl
@@ -12,10 +12,11 @@ export
 	NearestNeighborModel,
 	ObsAnnotationModel,
 	VarCountsFractionModel,
+	VarCountsSumModel,
 	PseudoBulkModel,
 	project,
-	set_var_id_cols!,
-	set_obs_id_cols!,
+	set_var_id_col!,
+	set_obs_id_col!,
 	var_coordinates,
 	obs_coordinates,
 	load10x,
@@ -39,6 +40,9 @@ export
 	var_to_obs,
 	var_to_obs_table,
 	var_counts_fraction!,
+	var_counts_fraction,
+	var_counts_sum!,
+	var_counts_sum,
 	pseudobulk,
 	local_outlier_factor!,
 	local_outlier_factor_projection!,
@@ -92,6 +96,9 @@ include("sctransformsparse.jl")
 
 include("implicitsvd.jl")
 
+include("annotations.jl")
+include("annotation_utils.jl")
+
 include("lowrank.jl")
 include("projectionmodels.jl")
 include("datamatrix.jl")
@@ -116,6 +123,7 @@ include("reduce.jl")
 include("annotate.jl")
 include("statistical_tests.jl")
 include("counts_fraction.jl")
+include("counts_sum.jl")
 include("pseudobulk.jl")
 
 include("local_outlier_factor.jl")

diff --git a/src/adjacency_matrices.jl b/src/adjacency_matrices.jl
@@ -44,12 +44,12 @@ end
 function knn_adjacency_matrix(data::DataMatrix; kwargs...)
 	adj = knn_adjacency_matrix(obs_coordinates(data.matrix); kwargs...)
 	obs = copy(data.obs)
-	DataMatrix(adj, obs, obs; var_id_cols=data.obs_id_cols, data.obs_id_cols)
+	DataMatrix(adj, obs, obs)
 end
 
 function knn_adjacency_matrix(X::DataMatrix, Y::DataMatrix; kwargs...)
 	adj = knn_adjacency_matrix(obs_coordinates(X.matrix), obs_coordinates(Y.matrix); kwargs...)
-	DataMatrix(adj, copy(X.obs), copy(Y.obs); var_id_cols=X.obs_id_cols, Y.obs_id_cols)
+	DataMatrix(adj, copy(X.obs), copy(Y.obs))
 end
 
 
@@ -69,10 +69,10 @@ At the moment all points in `Y` are required to have the same number of neighbor
 for computation reasons.
 """
 function adjacency_distances(adj::DataMatrix, X::DataMatrix, Y::DataMatrix=X)
-	table_cols_equal(adj.var, X.obs; cols=X.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
-	table_cols_equal(adj.obs, Y.obs; cols=Y.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
+	table_cols_equal(adj.var, X.obs; cols=names(X.obs,1)) || error("Adjacency matrix and DataMatrix have different obs.")
+	table_cols_equal(adj.obs, Y.obs; cols=names(Y.obs,1)) || error("Adjacency matrix and DataMatrix have different obs.")
 	D = _adjacency_distances(adj.matrix, X, Y)
-	DataMatrix(D, copy(adj.var), copy(adj.obs); adj.var_id_cols, adj.obs_id_cols)
+	DataMatrix(D, copy(adj.var), copy(adj.obs))
 end
 
 
@@ -98,7 +98,7 @@ function _adjacency_distances(adj, X::DataMatrix, Y::DataMatrix=X)
 
 		# Xs = X[:,Is] # Doesn't work since DataMatrix doesn't allow duplicate IDs
 		# Temporary workaround - TODO: fix proper interface?
-		Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)); var_id_cols=X.var_id_cols)
+		Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)))
 
 
 		# Ys = Y[:,Js] # guaranteed to be equal to Y

diff --git a/src/annotate.jl b/src/annotate.jl
@@ -32,13 +32,13 @@ function ObsAnnotationModel(fvar, data::DataMatrix;
 
 	# kwargs trick to let defaults be decided here if `nothing` is passed to name_src or names
 	if names === nothing
-		name_src = @something name_src _get_name_src(fvar) data.var_id_cols
+		name_src = @something name_src _get_name_src(fvar) Base.names(data.var,1)
 		names = _default_out_name(name_src)
 	end
 
 	var_ind = _filter_indices(data.var, fvar)
 	v = data.var[var_ind,:]
-	var_match = select(v, data.var_id_cols; copycols=false)
+	var_match = select(v, 1; copycols=false)
 	isempty(var_match) && throw(ArgumentError("No variables match filter ($fvar)."))
 	ObsAnnotationModel(var_match, instantiate_out_names(v, names), var, obs, matrix)
 end
@@ -88,7 +88,7 @@ end
 function var_to_obs_table(fvar, data; kwargs...)
 	model = ObsAnnotationModel(fvar, data; kwargs...)
 	new_obs = _new_annot(data, model)
-	hcat(select(data.obs, data.obs_id_cols), new_obs)
+	hcat(select(data.obs, 1), new_obs)
 end
 
 

diff --git a/src/annotation_utils.jl b/src/annotation_utils.jl
@@ -0,0 +1,17 @@
+find_annotation(::String, ::Nothing) = nothing # no annotation source supplied, so there is nothing to look up
+function find_annotation(name::String, df::DataFrame) # returns `nothing`, or a non-copying table holding the ID column (column 1) and `name`
+	hasproperty(df, name) || return nothing
+	select(df, unique!([only(names(df,1)), name]); copycols=false) # unique!: if `name` is the ID column itself, select it once instead of passing a duplicated selector
+end
+function find_annotation(name::String, a::Annotations)
+	hit = get(a, name, nothing) # an `Annotations` restricted to ID column + `name`, or `nothing` if the column is absent
+	return isnothing(hit) ? nothing : get_table(hit)
+end
+
+function find_annotation(name::String, annot::AbstractVector)
+	for source in annot # sources are tried in order; the first one that has `name` wins
+		hit = find_annotation(name, source)
+		isnothing(hit) || return hit
+	end
+	return nothing
+end
diff --git a/src/annotations.jl b/src/annotations.jl
@@ -0,0 +1,60 @@
+# NB: Annotations is considered experimental API and thus not exported.
+#     It may get breaking changes in minor/patch releases.
+struct Annotations
+	df::DataFrame # Implementation detail, might be changed later. Column 1 is the ID column; that column's name is also the name of the axis.
+end
+
+get_table(a::Annotations) = getfield(a,:df) # getfield rather than `a.df`: getproperty is overloaded for Annotations to perform column lookup
+
+
+Base.haskey(a::Annotations, name::String) = hasproperty(get_table(a), name) # true for any column of the wrapped table, the ID column included
+
+
+function Base.get(f::Union{Type,Function}, a::Annotations, column::String)
+	table = get_table(a)
+	hasproperty(table, column) || return f() # missing column: the result of calling the fallback `f` is returned instead
+	id_name = only(names(table, 1))
+	wanted = column == id_name ? [id_name] : [id_name, column] # never list the ID column twice
+	return Annotations(select(table, wanted; copycols=false))
+end
+Base.get(a::Annotations, column::String, default) = get(()->default, a, column) # value-default form, routed through the function-default method
+Base.get(f::Union{Type,Function}, a::Annotations, column::Symbol) = get(f, a, String(column)) # Symbol column names are converted to String
+Base.get(a::Annotations, column::Symbol, default) = get(a,String(column), default)
+
+
+Base.getindex(a::Annotations, column::Union{Symbol,String}) = get(()->throw(KeyError(column)), a, column) # same lookup as `get`, but a missing column raises KeyError
+
+function Base.getindex(a::Annotations, columns::AbstractVector{String})
+	table = get_table(a)
+	for name in columns
+		hasproperty(table, name) || throw(KeyError(name))
+	end
+
+	id_name = only(names(table,1))
+	id_pos = findfirst(isequal(id_name), columns)
+	if id_pos === nothing # ID column not requested? Put it first, ahead of the requested columns.
+		wanted = vcat(id_name, columns)
+	else # ID column requested? Move it first and keep the relative order of the others.
+		before = @view(columns[1:id_pos-1])
+		wanted = vcat(id_name, before, @view(columns[id_pos+1:end]))
+	end
+	Annotations(select(table, wanted; copycols=false))
+end
+Base.getindex(a::Annotations, columns::AbstractVector{<:Union{Symbol,String}}) = a[String.(columns)] # convert names to String, then reuse the AbstractVector{String} method
+
+
+Base.propertynames(a::Annotations, private::Bool=false) = propertynames(get_table(a), private) # default for `private` so that the one-argument `propertynames(a)` also lists the table's columns instead of falling back to the struct field `df`
+Base.getproperty(a::Annotations, column::Symbol) = a[column] # `a.name` is column lookup, so the `df` field itself is only reachable through getfield/get_table
+Base.getproperty(a::Annotations, column::String) = a[column]
+
+function annotation_name(a::Annotations) # name of the data column of a two-column (ID + data) Annotations object
+	df = get_table(a)
+	@assert size(df,2) == 2 "Expected annotations object to have an ID column and a single data column, got columns: $(names(df))" # NOTE(review): assertions may be disabled at some optimization levels; consider an explicit throw if this guards user input
+	only(names(df,2)) # the second column is the single data column
+end
+
+# function annotation_values(a::Annotations)
+# 	df = get_table(a)
+# 	@assert size(df,2) == 2 "Expected annotations object to have an ID column and a single data column, got columns: $(names(df))"
+# 	df[!,2]
+# end
diff --git a/src/counts_fraction.jl b/src/counts_fraction.jl
@@ -14,7 +14,7 @@ function VarCountsFractionModel(counts::DataMatrix{<:AbstractMatrix{<:Integer}},
 	sub_ind = _filter_indices(var_annot, sub_filter)
 	tot_ind = _filter_indices(var_annot, tot_filter)
 
-	var_id = select(var_annot, counts.var_id_cols)
+	var_id = select(var_annot, 1)
 	var_match_sub = var_id[sub_ind, :]
 	var_match_tot = var_id[tot_ind, :]
 
@@ -39,7 +39,7 @@ update_model(m::VarCountsFractionModel; col=m.col, var=m.var, obs=m.obs, matrix=
 
 # TODO: make general table utility function?
 function _matching_var_mask(v, sub)
-	bad_ind = findfirst(isnothing, table_indexin(sub,v; cols=	names(sub)))
+	bad_ind = findfirst(isnothing, table_indexin(sub,v; cols=names(sub)))
 	if bad_ind !== nothing
 		error("Row with contents (", join(sub[bad_ind,:],","), ") not found in var.")
 	end
@@ -64,12 +64,16 @@ end
 
 
 """
-	var_counts_fraction!(counts::DataMatrix, sub_filter, tot_filter, col; check=true)
+	var_counts_fraction!(counts::DataMatrix, sub_filter, tot_filter, col; check=true, var=:keep, obs=:keep)
 
 For each observation, compute the fraction of counts that match a specific variable pattern.
 * `sub_filter` decides which variables are counted.
 * `tot_filter` decides which variables to include in the total.
-* If `check=true`, an error will be thrown if no variables match the patterns.
+
+kwargs:
+* `var` - Use this to set `var` in the `ProjectionModel`.
+* `obs` - Use this to set `obs` in the `ProjectionModel`. Note that `counts.obs` is changed in place, regardless of the value of `obs`.
+* `check` - If `true`, an error will be thrown if no variables match the patterns.
 
 For more information on filtering syntax, see examples below and the documentation on [`DataFrames.filter`](https://dataframes.juliadata.org/stable/lib/functions/#Base.filter).
 
@@ -78,13 +82,15 @@ Examples
 
 Compute the fraction of reads in MT- genes, considering only "Gene Expression" features (and not e.g. "Antibody Capture").
 ```
-var_counts_fraction!(counts, "name"=>contains(r"^MT-"), "feature_type"=>isequal("Gene Expression"), "fraction_mt")
+var_counts_fraction!(counts, "name"=>startswith("MT-"), "feature_type"=>isequal("Gene Expression"), "fraction_mt")
 ```
 
 Compute the fraction of reads in MT- genes, when there is no `feature_type` annotation (i.e. all variables are genes).
 ```
-var_counts_fraction!(counts, "name"=>contains(r"^MT-"), Returns(true), "fraction_mt")
+var_counts_fraction!(counts, "name"=>startswith("MT-"), Returns(true), "fraction_mt")
 ```
+
+See also: [`var_counts_fraction`](@ref)
 """
 function var_counts_fraction!(counts::DataMatrix{<:AbstractMatrix{<:Integer}}, args...; kwargs...)
 	model = VarCountsFractionModel(counts, args...; var=:keep, obs=:keep, matrix=:keep, kwargs...)
@@ -93,6 +99,43 @@ function var_counts_fraction!(counts::DataMatrix{<:AbstractMatrix{<:Integer}}, a
 	counts
 end
 
+
+"""
+	var_counts_fraction(counts::DataMatrix, sub_filter, tot_filter, col; check=true, var=:copy, obs=:copy)
+
+For each observation, compute the fraction of counts that match a specific variable pattern.
+* `sub_filter` decides which variables are counted.
+* `tot_filter` decides which variables to include in the total.
+
+kwargs:
+* `var` - Can be `:copy` (make a copy of source `var`) or `:keep` (share the source `var` object).
+* `obs` - Can be `:copy` (make a copy of source `obs`) or `:keep` (share the source `obs` object).
+* `check` - If `true`, an error will be thrown if no variables match the patterns.
+
+For more information on filtering syntax, see examples below and the documentation on [`DataFrames.filter`](https://dataframes.juliadata.org/stable/lib/functions/#Base.filter).
+
+Examples
+=========
+
+Compute the fraction of reads in MT- genes, considering only "Gene Expression" features (and not e.g. "Antibody Capture").
+```
+var_counts_fraction(counts, "name"=>startswith("MT-"), "feature_type"=>isequal("Gene Expression"), "fraction_mt")
+```
+
+Compute the fraction of reads in MT- genes, when there is no `feature_type` annotation (i.e. all variables are genes).
+```
+var_counts_fraction(counts, "name"=>startswith("MT-"), Returns(true), "fraction_mt")
+```
+
+See also: [`var_counts_fraction!`](@ref)
+"""
+function var_counts_fraction(counts::DataMatrix{<:AbstractMatrix{<:Integer}}, args...; kwargs...)
+	model = VarCountsFractionModel(counts, args...; var=:copy, obs=:copy, matrix=:keep, kwargs...) # kwargs are splatted last, so caller-supplied values take precedence over these defaults
+	project(counts, model)
+end
+
+
+
 function project_impl(counts::DataMatrix{<:AbstractMatrix{<:Integer}}, model::VarCountsFractionModel; verbose=true)
 	frac = _var_counts_fraction(counts, model)