diff --git a/src/power_divergence.jl b/src/power_divergence.jl
index baa2cbf3..2b4ddd9e 100644
--- a/src/power_divergence.jl
+++ b/src/power_divergence.jl
@@ -38,7 +38,8 @@ function testname(x::PowerDivergenceTest)
 end
 
 # parameter of interest: name, value under h0, point estimate
-population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities", x.theta0, x.thetahat)
+population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities",
+    x.theta0, x.thetahat)
 default_tail(test::PowerDivergenceTest) = :right
 
 pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df),x.stat; tail=tail)
@@ -49,8 +50,8 @@ pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df),x.stat; tail=ta
 Compute a confidence interval with coverage `level` for multinomial proportions using
 one of the following methods. Possible values for `method` are:
 
-  - `:auto` (default): If the minimum of the expected cell counts exceeds 100, Quesenberry-Hurst
-    intervals are used, otherwise Sison-Glaz.
+  - `:auto` (default): If the minimum of the expected cell counts exceeds 100,
+    Quesenberry-Hurst intervals are used, otherwise Sison-Glaz.
   - `:sison_glaz`: Sison-Glaz intervals
   - `:bootstrap`: Bootstrap intervals
   - `:quesenberry_hurst`: Quesenberry-Hurst intervals
@@ -59,8 +60,9 @@ one of the following methods. Possible values for `method` are:
 # References
 
   * Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013.
-  * Sison, C.P and Glaz, J. Simultaneous confidence intervals and sample size determination
-    for multinomial proportions. Journal of the American Statistical Association,
+  * Sison, C.P. and Glaz, J. Simultaneous confidence intervals and sample size
+    determination for multinomial proportions. Journal of the American Statistical
+    Association,
     90:366-369, 1995.
   * Quesenberry, C.P. and Hurst, D.C. Large Sample Simultaneous Confidence Intervals for
     Multinomial Proportions. Technometrics, 6:191-195, 1964.
@@ -102,8 +104,10 @@ end
 
 # Bootstrap
 function ci_bootstrap(x::PowerDivergenceTest,alpha::Float64, iters::Int64)
-    m = mapslices(x -> quantile(x, [alpha / 2, 1 - alpha / 2]), rand(Multinomial(x.n, convert(Vector{Float64}, x.thetahat)),iters) / x.n, dims=2)
-    Tuple{Float64,Float64}[(boundproportion(m[i,1]), boundproportion(m[i,2])) for i in 1:length(x.thetahat)]
+    m = mapslices(x -> quantile(x, [alpha / 2, 1 - alpha / 2]), rand(Multinomial(x.n,
+        convert(Vector{Float64}, x.thetahat)),iters) / x.n, dims=2)
+    Tuple{Float64,Float64}[(boundproportion(m[i,1]), boundproportion(m[i,2])) for i in
+        1:length(x.thetahat)]
 end
 
 # Quesenberry, Hurst (1964)
@@ -111,19 +115,24 @@ function ci_quesenberry_hurst(x::PowerDivergenceTest, alpha::Float64; GC::Bool=t
     m = length(x.thetahat)
     cv = GC ?
quantile(Chisq(1), 1 - alpha / m) : quantile(Chisq(m - 1), 1 - alpha) - u = (cv .+ 2 .* x.observed .+ sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv)) - l = (cv .+ 2 .* x.observed .- sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv)) + u = (cv .+ 2 .* x.observed .+ sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- + x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv)) + l = (cv .+ 2 .* x.observed .- sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- + x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv)) Tuple{Float64,Float64}[(boundproportion(l[j]), boundproportion(u[j])) for j in 1:m] end # asymptotic simultaneous confidence interval # Gold (1963) -function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true, GC::Bool=true) +function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true, + GC::Bool=true) m = length(x.thetahat) cv = GC ? quantile(Chisq(1), 1 - alpha / 2m) : quantile(Chisq(m - 1), 1 - alpha / 2) - u = [x.thetahat[j] + sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) + ifelse(correct, inv(2x.n), 0) for j in 1:m] - l = [x.thetahat[j] - sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) - ifelse(correct, inv(2x.n), 0) for j in 1:m] + u = [x.thetahat[j] + sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) + + ifelse(correct, inv(2x.n), 0) for j in 1:m] + l = [x.thetahat[j] - sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) - + ifelse(correct, inv(2x.n), 0) for j in 1:m] Tuple{Float64,Float64}[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m] end @@ -172,7 +181,8 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo m1[i] = mu[1] m2[i] = mu[2] + mu[1] - mu[1]^2 m3[i] = mu[3] + mu[2] * (3 - 3mu[1]) + (mu[1] - 3mu[1]^2 + 2mu[1]^3) - m4[i] = mu[4] + mu[3] * (6 - 4mu[1]) + mu[2] * (7 - 12mu[1] + 6mu[1]^2) + mu[1] - 4mu[1]^2 + 6mu[1]^3 - 3mu[1]^4 + m4[i] = (mu[4] + mu[3] * (6 - 4mu[1]) + mu[2] * (7 - 12mu[1] + 6mu[1]^2) + + mu[1] - 4mu[1]^2 + 6mu[1]^3 - 3mu[1]^4) m5[i] = den end for i in 1:k @@ -183,7 +193,8 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo g1 = s3 / s2^(3/2) g2 = s4 / s2^2 - poly = 1 + g1 * (z^3 - 3z) / 6 + g2 * (z^4 - 6z^2 + 3) / 24 + g1^2 * (z^6 - 15z^4 + 45z^2 - 15) / 72 + poly = 1 + g1 * (z^3 - 3z) / 6 + g2 * (z^4 - 6z^2 + 3) / 24 + g1^2 * (z^6 - 15z^4 + + 45z^2 - 15) / 72 f = poly * exp(-z^2 / 2) / sqrt(2π) probx = prod(m5) @@ -220,9 +231,11 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo out[i,5] = min(x.thetahat[i] + cn + onen, 1) end if skew_correct - return Tuple{Float64,Float64}[(boundproportion(out[i,2]), boundproportion(out[i,3])) for i in 1:k] + return Tuple{Float64,Float64}[(boundproportion(out[i,2]), + boundproportion(out[i,3])) for i in 1:k] else - return Tuple{Float64,Float64}[(boundproportion(out[i,4]), boundproportion(out[i,5])) for i in 1:k] + return Tuple{Float64,Float64}[(boundproportion(out[i,4]), + boundproportion(out[i,5])) for i in 1:k] end end @@ -230,35 +243,43 @@ end # Cressie and Read 1984; Read and Cressie 1988 # lambda = 1: Pearson's Chi-square statistic # lambda -> 0: Converges to Likelihood Ratio test stat -# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale, Kullback 1978) +# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale, +# Kullback 1978) # lambda = -2: Neyman Modified chi-squared statistic (Neyman 1949) # lambda = -.5: Freeman-Tukey statistic (Freeman, Tukey 
1950)
-# Under regularity conditions, their asymptotic distributions are all the same (Drost 1989)
+# Under regularity conditions, their asymptotic distributions are all the same (Drost
+# 1989)
 # Chi-squared null approximation works best for lambda near 2/3
 
 """
-    PowerDivergenceTest(x[, y]; lambda = 1.0, theta0 = ones(length(x))/length(x))
+    PowerDivergenceTest(x[, y]; lambda = 1.0,
+                        theta0 = ones(length(x))/length(x), ddof = 0)
 
 Perform a Power Divergence test.
 
 If `y` is not given and `x` is a matrix with one row or column, or `x` is a vector, then
 a goodness-of-fit test is performed (`x` is treated as a one-dimensional contingency
 table). In this case, the hypothesis tested is whether the population probabilities equal
-those in `theta0`, or are all equal if `theta0` is not given.
+those in `theta0`, or are all equal if `theta0` is not given.
+
 If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
-contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
-table is calculated using the `counts` function from the `StatsBase` package. Then the power
-divergence test is conducted under the null hypothesis that the joint distribution of the
-cell counts in a 2-dimensional contingency table is the product of the row and column
-marginals.
+contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
+contingency table is calculated using the `counts` function from the `StatsBase` package.
+Then the power divergence test is conducted under the null hypothesis that the joint
+distribution of the cell counts in a 2-dimensional contingency table is the product of the
+row and column marginals.
 
 Note that the entries of `x` (and `y` if provided) must be non-negative integers.
 
-Computed confidence intervals by default are Quesenberry-Hurst intervals
-if the minimum of the expected cell counts exceeds 100, and Sison-Glaz intervals otherwise.
-See the [`confint(::PowerDivergenceTest)`](@ref) documentation for a list of
-supported methods to compute confidence intervals.
+The `ddof` parameter is the "delta degrees of freedom" adjustment to the number of degrees
+of freedom used for the calculation of p-values. The number of degrees of freedom is
+decreased by `ddof`.
+
+Computed confidence intervals by default are Quesenberry-Hurst intervals if the minimum of
+the expected cell counts exceeds 100, and Sison-Glaz intervals otherwise. See the
+[`confint(::PowerDivergenceTest)`](@ref) documentation for a list of supported methods to
+compute confidence intervals.
 
 The power divergence test is given by
 ```math
@@ -284,31 +305,36 @@ Implements: [`pvalue`](@ref), [`confint(::PowerDivergenceTest)`](@ref)
 
 * Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013.
""" -function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} +function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} = + ones(length(x))/length(x), ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} nrows, ncols = size(x) n = sum(x) #validate date - (any(x .< 0) || any(!isfinite, x)) && throw(ArgumentError("all entries must be nonnegative and finite")) + (any(x .< 0) || any(!isfinite, x)) && throw(ArgumentError("all entries must be + nonnegative and finite")) n == 0 && throw(ArgumentError("at least one entry must be positive")) - (!isfinite(nrows) || !isfinite(ncols) || !isfinite(nrows*ncols)) && throw(ArgumentError("invalid number of rows or columns")) + (!isfinite(nrows) || !isfinite(ncols) || !isfinite(nrows*ncols)) && + throw(ArgumentError("invalid number of rows or columns")) if nrows > 1 && ncols > 1 rowsums = sum(x, dims=2) colsums = sum(x, dims=1) - df = (nrows - 1) * (ncols - 1) + df = (nrows - 1) * (ncols - 1) - ddof thetahat = x ./ n xhat = rowsums * colsums / n theta0 = xhat / n - V = Float64[(colsums[j]/n) * (rowsums[i]/n) * (1 - rowsums[i]/n) * (n - colsums[j]) for i in 1:nrows, j in 1:ncols] + V = Float64[(colsums[j]/n) * (rowsums[i]/n) * (1 - rowsums[i]/n) * (n - + colsums[j]) for i in 1:nrows, j in 1:ncols] elseif nrows == 1 || ncols == 1 - df = length(x) - 1 + df = length(x) - 1 - ddof xhat = reshape(n * theta0, size(x)) thetahat = x / n V = reshape(n .* theta0 .* (1 .- theta0), size(x)) - abs(1 - sum(theta0)) <= sqrt(eps()) || throw(ArgumentError("Probabilities must sum to one")) + abs(1 - sum(theta0)) <= sqrt(eps()) || + throw(ArgumentError("Probabilities must sum to one")) else throw(ArgumentError("Number of rows or columns must be at least 1")) end @@ -331,24 +357,29 @@ function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector stat *= 2 / (lambda * (lambda + 1)) end - PowerDivergenceTest(lambda, vec(theta0), stat, df, x, n, vec(thetahat), xhat, (x - xhat) ./ sqrt.(xhat), (x - xhat) ./ sqrt.(V)) + PowerDivergenceTest(lambda, vec(theta0), stat, df, x, n, vec(thetahat), xhat, (x - + xhat) ./ sqrt.(xhat), (x - xhat) ./ sqrt.(V)) end #convenience functions #PDT -function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}; lambda::U=1.0) where {T<:Integer,U<:AbstractFloat} +function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, + levels::Levels{T}; lambda::U=1.0, ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} d = counts(x, y, levels) - PowerDivergenceTest(d, lambda=lambda) + PowerDivergenceTest(d, lambda=lambda, ddof=ddof) end -function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T; lambda::U=1.0) where {T<:Integer,U<:AbstractFloat} +function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T; + lambda::U=1.0, ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} d = counts(x, y, k) - PowerDivergenceTest(d, lambda=lambda) + PowerDivergenceTest(d, lambda=lambda, ddof=ddof) end -PowerDivergenceTest(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} = - PowerDivergenceTest(reshape(x, length(x), 1), lambda=lambda, theta0=theta0) +PowerDivergenceTest(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} = + ones(length(x))/length(x), ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} = + PowerDivergenceTest(reshape(x, length(x), 1), lambda=lambda, theta0=theta0, + 
 
 #ChisqTest
 """
@@ -360,35 +391,40 @@ with ``λ = 1``).
 
 If `y` is not given and `x` is a matrix with one row or column, or `x` is a vector,
 then a goodness-of-fit test is performed (`x` is treated as a one-dimensional contingency
 table). In this case, the hypothesis tested is whether the population probabilities equal
-those in `theta0`, or are all equal if `theta0` is not given.
+those in `theta0`, or are all equal if `theta0` is not given. The `ddof` parameter is the
+"delta degrees of freedom" adjustment to the number of degrees of freedom used for the
+calculation of p-values. The number of degrees of freedom is decreased by `ddof`.
 
 If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
-contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
-table is calculated using `counts` function from the `StatsBase` package. Then the power
-divergence test is conducted under the null hypothesis that the joint distribution of the
-cell counts in a 2-dimensional contingency table is the product of the row and column
-marginals.
+contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
+contingency table is calculated using the `counts` function from the `StatsBase` package.
+Then the power divergence test is conducted under the null hypothesis that the joint
+distribution of the cell counts in a 2-dimensional contingency table is the product of the
+row and column marginals.
 
 Note that the entries of `x` (and `y` if provided) must be non-negative integers.
 
 Implements: [`pvalue`](@ref), [`confint`](@ref)
 """
-function ChisqTest(x::AbstractMatrix{T}) where T<:Integer
-    PowerDivergenceTest(x, lambda=1.0)
+function ChisqTest(x::AbstractMatrix{T}; ddof::Integer=0) where T<:Integer
+    PowerDivergenceTest(x, lambda=1.0, ddof=ddof)
 end
 
-function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}) where T<:Integer
+function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T};
+    ddof::Integer=0) where T<:Integer
     d = counts(x, y, levels)
-    PowerDivergenceTest(d, lambda=1.0)
+    PowerDivergenceTest(d, lambda=1.0, ddof=ddof)
 end
 
-function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where T<:Integer
+function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T;
+    ddof::Integer=0) where T<:Integer
     d = counts(x, y, k)
-    PowerDivergenceTest(d, lambda=1.0)
+    PowerDivergenceTest(d, lambda=1.0, ddof=ddof)
 end
 
-ChisqTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} =
-    PowerDivergenceTest(reshape(x, length(x), 1), lambda=1.0, theta0=theta0)
+ChisqTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x);
+    ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} =
+    PowerDivergenceTest(reshape(x, length(x), 1), lambda=1.0, theta0=theta0, ddof=ddof)
 
 #MultinomialLRTest
 """
@@ -403,11 +439,11 @@ table). In this case, the hypothesis tested is whether the population probabilit
 those in `theta0`, or are all equal if `theta0` is not given.
 
 If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
-contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
-table is calculated using `counts` function from the `StatsBase` package. Then the power
-divergence test is conducted under the null hypothesis that the joint distribution of the
-cell counts in a 2-dimensional contingency table is the product of the row and column
-marginals.
+contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
+contingency table. Otherwise, `x` and `y` must be vectors of the same length. The +contingency table is calculated using `counts` function from the `StatsBase` package. Then +the power divergence test is conducted under the null hypothesis that the joint +distribution of the cell counts in a 2-dimensional contingency table is the product of the +row and column marginals. Note that the entries of `x` (and `y` if provided) must be non-negative integers. @@ -417,17 +453,20 @@ function MultinomialLRTest(x::AbstractMatrix{T}) where T<:Integer PowerDivergenceTest(x, lambda=0.0) end -function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}) where T<:Integer +function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, + levels::Levels{T}) where T<:Integer d = counts(x, y, levels) PowerDivergenceTest(d, lambda=0.0) end -function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where T<:Integer +function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where + T<:Integer d = counts(x, y, k) PowerDivergenceTest(d, lambda=0.0) end -MultinomialLRTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} = +MultinomialLRTest(x::AbstractVector{T}, + theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} = PowerDivergenceTest(reshape(x, length(x), 1), lambda=0.0, theta0=theta0) function show_params(io::IO, x::PowerDivergenceTest, ident="") diff --git a/test/power_divergence.jl b/test/power_divergence.jl index 6c058b08..3a7507c3 100644 --- a/test/power_divergence.jl +++ b/test/power_divergence.jl @@ -198,4 +198,17 @@ MultinomialLRTest(x,y,(1:3,1:3)) d = [113997 1031298 334453 37471] PowerDivergenceTest(d) + +# Test ddof for the ChisqTest +x = [8,10,16,6] +probs = [0.15865525393145702, 0.341344746068543, 0.34134474606854304, 0.15865525393145702] +m = ChisqTest(x, probs, ddof=2) +@test pvalue(m) ≈ 0.17603510054227095 + +# Test ddof for the PowerDivergenceTest +x = [8,10,16,6] +probs = [0.15865525393145702, 0.341344746068543, 0.34134474606854304, 0.15865525393145702] +m = PowerDivergenceTest(x, lambda=1.0, theta0=probs, ddof=2) +@test pvalue(m) ≈ 0.17603510054227095 + end