diff --git a/Project.toml b/Project.toml index 01da655..0dbbc0b 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.5.6" [deps] AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DirectionalStatistics = "e814f24e-44b0-11e9-2fd5-aba2b6113d95" EcoBase = "a58aae7d-b440-5a11-b283-399458f99aac" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -24,6 +25,7 @@ RCall = "0.13" RecipesBase = "0.6, 0.7, 0.8, 1" Requires = "^1" StatsBase = "0.32, 0.33" +DirectionalStatistics = "0.1.0 - 1.0" julia = "^1" [extras] diff --git a/src/Diversity.jl b/src/Diversity.jl index 90202a7..fd24954 100644 --- a/src/Diversity.jl +++ b/src/Diversity.jl @@ -190,6 +190,17 @@ export hillnumber end # sub-module Hill +""" + Diversity.Testing + +Hypothesis tests for distance matrices. +""" +module Testing + +include("Testing.jl") +export dispersion, permutest, mantel +end + # Path into package path(path...; dir::String = "test") = joinpath(@__DIR__, "..", dir, path...) diff --git a/src/Testing.jl b/src/Testing.jl new file mode 100644 index 0000000..b5e287c --- /dev/null +++ b/src/Testing.jl @@ -0,0 +1,6 @@ + +using LinearAlgebra, Random, DirectionalStatistics,AxisArrays +import Statistics: mean +include("Testing/ANOVA.jl") +include("Testing/dispersion.jl") +include("Testing/permutation_tests.jl") diff --git a/src/Testing/ANOVA.jl b/src/Testing/ANOVA.jl new file mode 100644 index 0000000..f66eb2b --- /dev/null +++ b/src/Testing/ANOVA.jl @@ -0,0 +1,67 @@ + +# Compute F-statistic, given vector of vectors +#ANOVA +function f(R) + X = vcat(R...) + N = length(X) + nj = length.(R) + X̅ = mean(X) + X̅j = mean.(R) + k = length(R) + top = sum(nj .*(X̅j .- X̅) .^2) /(k-1) + bottom = sum([sum((R[i] .- X̅j[i]) .^2) for i in 1:k])/(N-k) + return top/bottom +end + +function f(R,N,nj,X̅,k) + X̅j = mean.(R) + top = 0.0 + bottom = 0.0 + @inbounds for i in 1:k + top += nj[i] * (X̅j[i] - X̅)^2 + for j in 1:nj[i] + bottom += (R[i][j] - X̅j[i])^2 + end + end + top/= (k-1) + bottom /= (N-k) + return top/bottom +end + +#Pairwise ANOVA +function f_pairs(R) + n = length(R) + F = zeros(n,n) + for j in 1:n-1 + for i in j+1:n + F[i,j] = f([R[i],R[j]]) + end + end + return F +end +function f_pairs(R ::Vector,N ::Array{Int},nj ::Matrix{Tuple},X̅ ::Matrix{Float64}) + n = length(R) + F = zeros(n,n) + for j in 1:n-1 + for i in j+1:n + F[i,j] = f([R[i],R[j]],N[i,j],nj[i,j],X̅[i,j],2) + end + end + return F +end +# get parameter values corresponding to each pairwise combination of groups +function get_pars(R) + n = length(R) + N_p = Array{Int}(undef,n,n) + nj_p= Array{Tuple}(undef,n,n) + X̅_p = Array{Float64}(undef,n,n) + + for j in 1:n-1 + for i in j+1:n + N_p[i,j] = length(R[i]) + length(R[j]) + nj_p[i,j] = (length(R[i]) , length(R[j])) + X̅_p[i,j] = mean(vcat(R[i]...,R[j]...)) + end + end + return N_p,nj_p,X̅_p +end diff --git a/src/Testing/dispersion.jl b/src/Testing/dispersion.jl new file mode 100644 index 0000000..83cc72c --- /dev/null +++ b/src/Testing/dispersion.jl @@ -0,0 +1,98 @@ +# structs + +struct Disp + F ::Float64 + pairwise_F ::Array{Float64} + medians ::Union{Tuple,Vector} + residuals ::Vector{Matrix{Float64}} + means ::Vector{Float64} + group ::Vector + levels ::Vector + group_residuals ::Vector +end +#Helper functions + +# calculate distances from medians +function get_residuals(x,c) + d = x .-c + sum(d .^2,dims =2) +end + +# doube center matrix +function double_center(D) + A = D .- mean(D,dims = 1) + return A .- mean(A,dims = 2) +end + +# calculate medians +# Get spatial median as a row vector +geo_median(X) = transpose(DirectionalStatistics.geometric_median([row for row in eachrow(X)])) +#apply `geo_median` function to get positive_inds and negative_inds median for each group +function geo_median(pco_space, group, positive_inds) + + spMedPos = [geo_median(pco_space[group .== g,positive_inds]) for g in unique(group)] + spMedNeg = [geo_median(pco_space[group .== g, .!positive_inds]) for g in unique(group)] + + return (spMedPos,spMedNeg ) +end +function geo_median(pco_space, group) + + return [geo_median(pco_space[group .== g,:]) for g in unique(group)] +end + +function dispersion(D,group;metric = false) + """ + Quantify the dispersion around the group spatial median for each group in D, according to the given distance metric. + Once X is converted to a distance matrix, it is then transformed by principal coordinate analysis before medians and + distances are calculated. The algorithm is based on Anderson (2006). + + The function accepts: + D: A dissimiarity or distance matrix + group: A vector containing group labels + metric: (key word) boolean value specifying whether or not the distance used used metric + + This function returns a named tuple containing: + F = Global F-Statistic + pairwise_F = pairwise F-statistics + medians = spatial (aka geometric) median of each group in the transformed coordinates + residuals = vector containing each observations distance from its group median + means = vector containing the means distance to median for each group + group = the original grouping vector + levels = unique items from 'group' + + Anderson, M.J. (2006) Distance-based tests for homogeneity of multivariate dispersions. Biometrics 62(1), 245--253. + """ + levels = sort(unique(group)) + + if metric == false + # Double center the matrix. + A = double_center(D .^2 ) + vals, vecs = eigen(Hermitian(-A/2)) + pco_space = vecs * diagm(sqrt.(abs.(vals))) + # record indices corresponding to postitive eigenvalues + positive_inds = real.(vals) .> 0.0 + med = geo_median(pco_space,ones(size(pco_space)[1]),positive_inds) + medians = geo_median(pco_space, group, positive_inds) + # calculate distances (to median) for "positive_inds" and "negative_inds" vectors separately. in orer to + # subtract "negative_inds" from "positive_inds". See Anderson (2006) for details. + dis_pos = [get_residuals(pco_space[group .== levels[i],positive_inds], medians[1][i]) for i in 1:length(levels)] + dis_neg = [get_residuals(pco_space[group .== levels[i], .!positive_inds], medians[2][i]) for i in 1:length(levels)] + # Where dis_neg > dis_pos, set residual = 0 by taking only the real part + # of the square root of a negative. + residuals = [(real.(sqrt.(Complex.(dis_pos[i] .- dis_neg[i]))) ) for i in 1:length(dis_pos)] + group_dis_pos = get_residuals(vcat(medians[1]...), med[1][1]) + group_dis_neg = get_residuals(vcat(medians[2]...), med[2][1]) + group_residuals = [(real.(sqrt.(Complex.(group_dis_pos[i] .- group_dis_neg[i]))) ) for i in 1:length(group_dis_pos)] + + else + medians = geo_median(D, group) + med = geo_median(D,ones(size(D)[1])) + residuals = [get_residuals(D[group .== levels[i],:], medians[i]) for i in 1:length(levels)] + group_residuals = vec(get_residuals(vcat(medians...), med[1])) + end + F = f(residuals) + F_pairs = f_pairs(residuals) + means = AxisArray(mean.(residuals),string.(levels)) + return Disp(F, F_pairs, medians, residuals, means,group, levels,group_residuals) +end + diff --git a/src/Testing/permutation_tests.jl b/src/Testing/permutation_tests.jl new file mode 100644 index 0000000..71624af --- /dev/null +++ b/src/Testing/permutation_tests.jl @@ -0,0 +1,107 @@ + +function permutest(disp ::Disp,n_perm = 10000) + """ + This function permutes the residuals returned by 'dispersion' to provide global and pairwise + P-values. Currently, no corrections are made for multiple testing. MultipleTesting.jl provides + this functionality, if desired + + The function accepts: + disp: The named tuple returned by 'dispersion' + n_perm: the number of permutations + + This function returns a named tuple containing: + P = Global P-value + pairwise_P = pairwise P-value + F = Global F-Statistic + pairwise_F = pairwise F-statistics + + https://github.com/juliangehring/MultipleTesting.jl + """ + # read in values from `disp` + F = disp.F + R = disp.residuals + F_pairs = disp.pairwise_F + group = disp.group + levels = disp.levels + level_names = string.(levels) + inds = [group .== level for level in levels] + n = length(levels) + + # pre-calculate values for ANOVA function + r = vcat(R...) + X = vcat(R...) + N = length(X) + nj = Tuple(length.(R)) + X̅ = mean(X) + k = length(R) + # generate empty containers for F values + perm = Vector{Float64}(undef,n_perm) + #rs = copy(R)#Vector{Vector}(undef,n) + # run permutation + @inbounds for p in 1:n_perm + shuffle!(r) + rs = [view(r,inds[i]) for i in 1:n] + + perm[p] =f(rs,N,nj,X̅,k) + end + ppairs =zeros(k,k) + + @inbounds for i in 1:k-1 + for j in i+1:k + r_1 = r[group .== levels[i]] + r_2 = r[group .== levels[j]] + pstat =permutest(r_1,r_2, F_pairs[j,i],n_perm) + ppairs[j,i] = pstat + end + end + # calculate P-values and return P and F values + P = sum(perm .> F)/n_perm + P_pairs = AxisArray(ppairs,level_names,level_names) + F_pairs = AxisArray(LowerTriangular(F_pairs), level_names,level_names) + + return (P =P,pairwise_P = P_pairs,F = F, pairwise_F =F_pairs) +end + +function permutest(r_1,r_2 ,F,n_perm = 1000) + r = vcat(r_1,r_2) + nj = (length(r_1),length(r_2)) + n = 2 + N = sum(nj) + # pre-calculate values for ANOVA function + X̅ = mean(r) + k = 2 + inds = (1:nj[1],1:nj[2]) + # generate empty containers for F values + perm = Vector{Float64}(undef,n_perm) + # run permutation + @inbounds for p in 1:n_perm + shuffle!(r) + rs = (view(r,inds[1]) ,view(r,inds[2]) ) + perm[p] =f(rs,N,nj,X̅,k) + end + P = sum(perm .> F)/n_perm + return P +end + + +get_tri(x) = [x[i,j] for i in 2:size(x)[1] for j in 1:i] +function mantel(x_1,x_2, n_perm = 1000) + """ + A permutation based test of the correlation between two matrices. Given two dissimilarity/distance matrices of equal size, calculates the P-value as + the probability of obtaining the same or greater correlation coefficient after permuting one of the matrices. + """ + x = get_tri(x_1) + y = get_tri(x_2) + perm_c = Vector{Float64}(undef,n_perm) + + c = cor(x,y) + @inbounds for i in 1:n_perm + shuffle!(y) + perm_c[i] = cor(x,y) + end + + sum(perm_c .> c)/n_perm +end + + +