diff --git a/Project.toml b/Project.toml index c8515eea..95303f20 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "Associations" uuid = "614afb3a-e278-4863-8805-9959372b9ec2" authors = ["Kristian Agasøster Haaga ", "Tor Einar Møller ", "George Datseris "] repo = "https://github.com/kahaaga/Associations.jl.git" -version = "4.0.0" +version = "4.1.0" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" diff --git a/README.md b/README.md index e83c836e..3a2ea39c 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![CI](https://github.com/juliadynamics/Associations.jl/workflows/CI/badge.svg)](https://github.com/JuliaDynamics/Associations.jl/actions) [![](https://img.shields.io/badge/docs-latest_tagged-blue.svg)](https://juliadynamics.github.io/Associations.jl/stable/) -[![](https://img.shields.io/badge/docs-dev_(master)-blue.svg)](https://juliadynamics.github.io/Associations.jl/dev/) -[![codecov](https://codecov.io/gh/JuliaDynamics/Associations.jl/branch/master/graph/badge.svg?token=0b71n6x6AP)](https://codecov.io/gh/JuliaDynamics/Associations.jl) +[![](https://img.shields.io/badge/docs-dev_(main)-blue.svg)](https://juliadynamics.github.io/Associations.jl/dev/) +[[![codecov](https://codecov.io/gh/JuliaDynamics/Associations.jl/branch/master/graph/badge.svg?token=0b71n6x6AP)](https://codecov.io/gh/JuliaDynamics/Associations.jl)](https://img.shields.io/codecov/c/github/juliadynamics/Associations.jl/main) [![DOI](https://zenodo.org/badge/135443027.svg)](https://zenodo.org/badge/latestdoi/135443027) Associations.jl is a package for quantifying associations, independence testing and causal inference. diff --git a/changelog.md b/changelog.md index 1fc38fac..1791be2f 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,15 @@ # Changelog +From version v4.0 onwards, this package has been renamed to to Associations.jl. + +# 4.1 + +- New association measure: `ChatterjeeCorrelation`. + +# 4.0 (package rename) + +The package has been renamed from CausalityTools.jl to Associations.jl. + ## 3.0 (new major release) This release contains several breaking changes. Any code from before v3.0 will need diff --git a/docs/refs.bib b/docs/refs.bib index 173429ce..acd26cdf 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -1289,4 +1289,37 @@ @article{Arnhold1999 pages={419--430}, year={1999}, publisher={Elsevier} +} + +@article{Chatterjee2021, + title={A new coefficient of correlation}, + author={Chatterjee, Sourav}, + journal={Journal of the American Statistical Association}, + volume={116}, + number={536}, + pages={2009--2022}, + year={2021}, + publisher={Taylor \& Francis} +} + +@article{Shi2022, + title={On the power of Chatterjee’s rank correlation}, + author={Shi, Hongjian and Drton, Mathias and Han, Fang}, + journal={Biometrika}, + volume={109}, + number={2}, + pages={317--333}, + year={2022}, + publisher={Oxford University Press} +} + +@article{Dette2013, + title={A Copula-Based Non-parametric Measure of Regression Dependence}, + author={Dette, Holger and Siburg, Karl F and Stoimenov, Pavel A}, + journal={Scandinavian Journal of Statistics}, + volume={40}, + number={1}, + pages={21--41}, + year={2013}, + publisher={Wiley Online Library} } \ No newline at end of file diff --git a/docs/src/associations.md b/docs/src/associations.md index 0d6a729f..22144a1f 100644 --- a/docs/src/associations.md +++ b/docs/src/associations.md @@ -114,6 +114,7 @@ CorrelationMeasure PearsonCorrelation PartialCorrelation DistanceCorrelation +ChatterjeeCorrelation ``` ## [Cross-map measures](@id cross_map_api) diff --git a/docs/src/examples/examples_associations.md b/docs/src/examples/examples_associations.md index 218cb6bc..09703c70 100644 --- a/docs/src/examples/examples_associations.md +++ b/docs/src/examples/examples_associations.md @@ -1697,4 +1697,32 @@ using Random; rng = Xoshiro(1234); x = rand(rng, 300); y = rand(rng, 300) .* sin.(x); z = rand(rng, 300) .* y; est = RMCD(r = 0.5) association(est, x, y), association(est, x, z), association(est, x, z, y) -``` \ No newline at end of file +``` + +## [[`ChatterjeeCorrelation`](@ref)](@id example_ChatterjeeCorrelation) + +```@example example_ChatterjeeCorrelation +using Associations +using Random; rng = Xoshiro(1234); +x = rand(rng, 120) +y = rand(rng, 120) .* sin.(x) +``` + +By construction, there will almust surely be no ties in `x`, so we can use the +fast estimate by setting `handle_ties == false`. + +```@example example_ChatterjeeCorrelation +association(ChatterjeeCorrelation(handle_ties = false), x, y) +``` + +If we have data where we know there are ties in the data, then +we should set `handle_ties == true`. + +```@example example_ChatterjeeCorrelation +w = rand(rng, 1:10, 120) # there will be some ties +z = rand(rng, 1:15, 120) .* sin.(w) # introduce some dependence +association(ChatterjeeCorrelation(handle_ties = true), w, z) +``` + + + diff --git a/docs/src/examples/examples_independence.md b/docs/src/examples/examples_independence.md index 7b651207..6db1be08 100644 --- a/docs/src/examples/examples_independence.md +++ b/docs/src/examples/examples_independence.md @@ -117,6 +117,26 @@ enough evidence in the data to suggest directional coupling. ## [[`SurrogateAssociationTest`](@ref)](@id examples_surrogatetest) +### [Chatterjee correlation](@id example_SurrogateAssociationTest_ChatterjeeCorrelation) + +```@example example_SurrogateAssociationTest_ChatterjeeCorrelation +using Associations +using Random; rng = Xoshiro(1234) +n = 1000 +x, y = rand(1:50, n), rand(1:50, n) +test = SurrogateAssociationTest(ChatterjeeCorrelation(), nshuffles = 19) +independence(test, x, y) +``` + +As expected, the test indicates that we can't reject independence. What happens if we introduce +a third variable that depends on `y`? + +```@example example_SurrogateAssociationTest_ChatterjeeCorrelation +z = rand(1:20, n) .* y +independence(test, y, z) +``` + +The test clearly picks up on the functional dependence. ### [Distance correlation](@id example_SurrogateAssociationTest_DistanceCorrelation) diff --git a/src/core.jl b/src/core.jl index 911f5a90..cd364de5 100644 --- a/src/core.jl +++ b/src/core.jl @@ -35,7 +35,9 @@ an [`AssociationMeasureEstimator`](@ref) to compute. | Type | [`AssociationMeasure`](@ref) | Pairwise | Conditional | | -------------------------- | ------------------------------------------- | :------: | :---------: | | Correlation | [`PearsonCorrelation`](@ref) | ✓ | ✖ | +| Correlation | [`PartialCorrelation`](@ref) | ✓ | ✓ | | Correlation | [`DistanceCorrelation`](@ref) | ✓ | ✓ | +| Correlation | [`ChatterjeeCorrelation`](@ref) | ✓ | ✖ | | Closeness | [`SMeasure`](@ref) | ✓ | ✖ | | Closeness | [`HMeasure`](@ref) | ✓ | ✖ | | Closeness | [`MMeasure`](@ref) | ✓ | ✖ | @@ -85,43 +87,44 @@ Concrete subtypes are given as input to [`association`](@ref). ## Concrete implementations -| AssociationMeasure | Estimators | -| :------------------------------------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`PearsonCorrelation`](@ref) | Not required | -| [`DistanceCorrelation`](@ref) | Not required | -| [`PartialCorrelation`](@ref) | Not required | -| [`SMeasure`](@ref) | Not required | -| [`HMeasure`](@ref) | Not required | -| [`MMeasure`](@ref) | Not required | -| [`LMeasure`](@ref) | Not required | -| [`JointDistanceDistribution`](@ref) | Not required | -| [`PairwiseAsymmetricInference`](@ref) | [`RandomVectors`](@ref), [`RandomSegment`](@ref) | -| [`ConvergentCrossMapping`](@ref) | [`RandomVectors`](@ref), [`RandomSegment`](@ref) | -| [`MCR`](@ref) | Not required | -| [`RMCD`](@ref) | Not required | -| [`MIShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`KraskovStögbauerGrassberger1`](@ref), [`KraskovStögbauerGrassberger2`](@ref), [`GaoOhViswanath`](@ref), [`GaoKannanOhViswanath`](@ref), [`GaussianMI`](@ref) | -| [`MIRenyiJizba`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | -| [`MIRenyiSarbu`](@ref) | [`JointProbabilities`](@ref) | -| [`MITsallisFuruichi`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | -| [`MITsallisMartin`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | -| [`CMIShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`MIDecomposition`](@ref), [`GaussianCMI`](@ref), [`FPVP`](@ref), [`MesnerShalizi`](@ref), [`Rahimzamani`](@ref) | -| [`CMIRenyiSarbu`](@ref) | [`JointProbabilities`](@ref) | -| [`CMIRenyiJizba`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | -| [`CMIRenyiPoczos`](@ref) | [`PoczosSchneiderCMI`](@ref) | -| [`CMITsallisPapapetrou`](@ref) | [`JointProbabilities`](@ref) | -| [`TEShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`Zhu1`](@ref), [`Lindner`](@ref) | -| [`TERenyiJizba`](@ref) | [`JointProbabilities`](@ref) | -| [`PartialMutualInformation`](@ref) | [`JointProbabilities`](@ref) | -| [`JointEntropyShannon`](@ref) | [`JointProbabilities`](@ref) | -| [`JointEntropyRenyi`](@ref) | [`JointProbabilities`](@ref) | -| [`JointEntropyTsallis`](@ref) | [`JointProbabilities`](@ref) | -| [`ConditionalEntropyShannon`](@ref) | [`JointProbabilities`](@ref) | -| [`ConditionalEntropyTsallisAbe`](@ref) | [`JointProbabilities`](@ref) | -| [`ConditionalEntropyTsallisFuruichi`](@ref) | [`JointProbabilities`](@ref) | -| [`HellingerDistance`](@ref) | [`JointProbabilities`](@ref) | -| [`KLDivergence`](@ref) | [`JointProbabilities`](@ref) | -| [`RenyiDivergence`](@ref) | [`JointProbabilities`](@ref) | -| [`VariationDistance`](@ref) | [`JointProbabilities`](@ref) | +| AssociationMeasure | Estimators | +| :------------------------------------------ | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`PearsonCorrelation`](@ref) | Not required | +| [`DistanceCorrelation`](@ref) | Not required | +| [`PartialCorrelation`](@ref) | Not required | +| [`ChatterjeeCorrelation`](@ref) | Not required | +| [`SMeasure`](@ref) | Not required | +| [`HMeasure`](@ref) | Not required | +| [`MMeasure`](@ref) | Not required | +| [`LMeasure`](@ref) | Not required | +| [`JointDistanceDistribution`](@ref) | Not required | +| [`PairwiseAsymmetricInference`](@ref) | [`RandomVectors`](@ref), [`RandomSegment`](@ref) | +| [`ConvergentCrossMapping`](@ref) | [`RandomVectors`](@ref), [`RandomSegment`](@ref) | +| [`MCR`](@ref) | Not required | +| [`RMCD`](@ref) | Not required | +| [`MIShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`KraskovStögbauerGrassberger1`](@ref), [`KraskovStögbauerGrassberger2`](@ref), [`GaoOhViswanath`](@ref), [`GaoKannanOhViswanath`](@ref), [`GaussianMI`](@ref) | +| [`MIRenyiJizba`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | +| [`MIRenyiSarbu`](@ref) | [`JointProbabilities`](@ref) | +| [`MITsallisFuruichi`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | +| [`MITsallisMartin`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | +| [`CMIShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`MIDecomposition`](@ref), [`GaussianCMI`](@ref), [`FPVP`](@ref), [`MesnerShalizi`](@ref), [`Rahimzamani`](@ref) | +| [`CMIRenyiSarbu`](@ref) | [`JointProbabilities`](@ref) | +| [`CMIRenyiJizba`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref) | +| [`CMIRenyiPoczos`](@ref) | [`PoczosSchneiderCMI`](@ref) | +| [`CMITsallisPapapetrou`](@ref) | [`JointProbabilities`](@ref) | +| [`TEShannon`](@ref) | [`JointProbabilities`](@ref), [`EntropyDecomposition`](@ref), [`Zhu1`](@ref), [`Lindner`](@ref) | +| [`TERenyiJizba`](@ref) | [`JointProbabilities`](@ref) | +| [`PartialMutualInformation`](@ref) | [`JointProbabilities`](@ref) | +| [`JointEntropyShannon`](@ref) | [`JointProbabilities`](@ref) | +| [`JointEntropyRenyi`](@ref) | [`JointProbabilities`](@ref) | +| [`JointEntropyTsallis`](@ref) | [`JointProbabilities`](@ref) | +| [`ConditionalEntropyShannon`](@ref) | [`JointProbabilities`](@ref) | +| [`ConditionalEntropyTsallisAbe`](@ref) | [`JointProbabilities`](@ref) | +| [`ConditionalEntropyTsallisFuruichi`](@ref) | [`JointProbabilities`](@ref) | +| [`HellingerDistance`](@ref) | [`JointProbabilities`](@ref) | +| [`KLDivergence`](@ref) | [`JointProbabilities`](@ref) | +| [`RenyiDivergence`](@ref) | [`JointProbabilities`](@ref) | +| [`VariationDistance`](@ref) | [`JointProbabilities`](@ref) | """ abstract type AssociationMeasureEstimator end diff --git a/src/independence_tests/surrogate/SurrogateAssociationTest.jl b/src/independence_tests/surrogate/SurrogateAssociationTest.jl index 4afb14b4..31e5b633 100644 --- a/src/independence_tests/surrogate/SurrogateAssociationTest.jl +++ b/src/independence_tests/surrogate/SurrogateAssociationTest.jl @@ -151,6 +151,7 @@ end include("transferentropy.jl") include("crossmapping.jl") include("hlms_measure.jl") +include("chatterjee_correlation.jl") # Input checks function SurrogateAssociationTest(measure::T) where T <: MultivariateInformationMeasure diff --git a/src/independence_tests/surrogate/chatterjee_correlation.jl b/src/independence_tests/surrogate/chatterjee_correlation.jl new file mode 100644 index 00000000..132235ee --- /dev/null +++ b/src/independence_tests/surrogate/chatterjee_correlation.jl @@ -0,0 +1,19 @@ +function independence(test::SurrogateAssociationTest{<:ChatterjeeCorrelation}, + x::AbstractVector, y::AbstractVector) + (; est_or_measure, rng, surrogate, nshuffles) = test + + # Create a new instance of the measure with pre-allocated values. We can then + # re-use this struct to avoid excessive allocations. + m = est_or_measure # the old measure + measure = ChatterjeeCorrelation(x, y, rng = m.rng, lt = m.lt, handle_ties = m.handle_ties) + + Î = association(measure, x, y) + sx = surrogenerator(x, surrogate, rng) + Îs = zeros(nshuffles) + for b in 1:nshuffles + Îs[b] = association(est_or_measure, sx(), y) + end + p = count(Î .<= Îs) / nshuffles + + return SurrogateAssociationTestResult(2, Î, Îs, p, nshuffles) +end diff --git a/src/methods/correlation/chatterjee.jl b/src/methods/correlation/chatterjee.jl new file mode 100644 index 00000000..53991696 --- /dev/null +++ b/src/methods/correlation/chatterjee.jl @@ -0,0 +1,212 @@ +using Random + +export ChatterjeeCorrelation + + +# Based on the description in https://arxiv.org/pdf/2008.11619 +""" + ChatterjeeCorrelation <: CorrelationMeasure + ChatterjeeCorrelation(; handle_ties = true, rng = Random.default_rng()) + +The Chatterjee correlation measure [Chatterjee2021](@cite) is an asymmetric measure of dependence between +two variables. + +!!! info "Speeding up computations" + If `handle_ties == true`, then the first formula below is used. If you know + for sure that there are no ties in your data, then set `handle_ties == false`, + which will use the second (faster) formula below. + +!!! note "Randomization and reproducibility" + When rearranging the input datasets, the second variable `y` is sorted + according to a sorting of the first variable `x`. If `x` has ties, then + these ties are broken randomly and uniformly. For complete reproducibility in + this step, you can specify `rng`. If `x` has no ties, then no randomization + is performed. + +# Usage + +- Use with [`association`](@ref) to compute the raw Chatterjee correlation coefficient. +- Use with [`SurrogateAssociationTest`](@ref) to perform a surrogate test for significance + of a Chatterjee-type association ([example](@ref example_SurrogateAssociationTest_ChatterjeeCorrelation)). + When using a surrogate test for significance, the *first* input variable is shuffled + according to the given surrogate method. + +## Description + +The correlation statistic is defined as + +```math +\\epsilon_n(X, Y) = +1 - \\dfrac{n\\sum_{i=1}^{n-1} |r_{i+1} - r_i|}{2\\sum_{i=1}^n }. +``` + +When there are no ties among the ``Y_1, Y_2, \\ldots, Y_n``, the +measure is + +```math +\\epsilon_n(X, Y) = +1 - \\dfrac{3\\sum_{i=1}^{n-1} |r_{i+1} - r_i|}{n^2 - 1}. +``` + +This statistic estimates a quantity proposed by [Dette2013](@citet), as indicated in +[Shi2022](@citet). It can therefore also be called the Chatterjee-Dette-Siburg-Stoimenov +correlation coefficient. + +## Estimation + +- [Example 1](@ref example_ChatterjeeCorrelation). Estimating the Chatterjee correlation coefficient + for independent and for dependent variables. +- [Example 2](@ref example_SurrogateAssociationTest_ChatterjeeCorrelation). Testing the significance + of a Chatterjee-type association using a surrogate test. +""" +struct ChatterjeeCorrelation{XS, R, L, DR, YR, YSI, YSS, RNG} <: CorrelationMeasure + # Pre-allocated containers (if input data are given). + # If no input data are given, then these are all `nothing`. + xs_inds::XS # indices that would sort `x` + r::R # ranks + 𝓁::L # 𝓁s + Δr::DR # absolute value of first differences of ranks + y_sortedbyx::YR # ys, sorted according to the order that sorts the xs. + y_sortedbyx_inds::YSI # A container holding the indices that would sort the *second* input vector (Y). + y_sortedbyx_sorted::YSS # the sorted version of `y_sortedbyx` + rng::RNG + handle_ties::Bool + lt::Function +end + +# Non-preallocated version +function ChatterjeeCorrelation(; handle_ties = true, rng = Random.default_rng(), lt = (a, b) -> isless_rand(rng, a, b)) + return ChatterjeeCorrelation{Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(rng)}( + nothing, nothing, nothing, nothing, nothing, nothing, nothing, rng, handle_ties, lt) +end + +# Preallocated version +function ChatterjeeCorrelation(x, y; + handle_ties = true, rng = Random.default_rng(), lt = (a, b) -> isless_rand(rng, a, b)) + verify_inputs_chatterjee(x, y) + n = length(x) + xs_inds = zeros(Int, n) + y_sortedbyx = similar(y) # rearranged `y` (sorted according to `xs_inds`) + y_sortedbyx_inds = zeros(Int, n) # rearranged `y` (sorted according to `xs_inds`) + y_sortedbyx_sorted = similar(y) # sort the rearranged `y`. + + 𝓁 = zeros(Int, n) + r = zeros(Int, n) + Δr = zeros(n) + return ChatterjeeCorrelation(xs_inds, r, 𝓁, Δr, y_sortedbyx, y_sortedbyx_inds, + y_sortedbyx_sorted, rng, handle_ties, lt) +end + +# If the user provided a non-preallocated version, then we simply create a new version +# of the measure that *is* pre-allocated and use that. +function association(m::ChatterjeeCorrelation{Nothing}, x, y) + measure = ChatterjeeCorrelation(x, y, rng = m.rng, lt = m.lt, handle_ties = m.handle_ties) + return association(measure, x, y) +end + +# Pre-allocated version +function association(m::ChatterjeeCorrelation{<:AbstractVector}, x, y) + verify_inputs_chatterjee(x, y) + n = length(x) + + # Rearrange input data according to paper. + m.xs_inds .= sortperm!(m.xs_inds, x, lt = m.lt) + m.y_sortedbyx .= y[m.xs_inds] + + # Sort the rearranged `y`s so that we can more efficiently find the ranks. + m.y_sortedbyx_inds .= sortperm!(m.y_sortedbyx_inds, m.y_sortedbyx) + m.y_sortedbyx_sorted .= m.y_sortedbyx[m.y_sortedbyx_inds] + + # If we explicitly deal with ties, then we need to find both the ranks `r`s and + # the `𝓁`s that go into the denominator of the final expression. If we don't deal + # with ties, then we only need to find the ranks `r`, which save some time. + if m.handle_ties + loopcount_rs_ℓs!(m.r, m.𝓁, m.y_sortedbyx_sorted, m.y_sortedbyx); + else + loopcount_rs!(m.r, m.y_sortedbyx_sorted, m.y_sortedbyx) + end + + # First differences of the ranks, stored in the pre-allocated `Δr`. + absfirstdiffs!(m.Δr, m.r) + + return chatterjee_coefficient(m.Δr, m.𝓁, n, m.handle_ties) +end + +function loopcount_rs!(r, sorted_y_sortedbyx, y_sortedbyx) + for (i, yᵢ) in enumerate(y_sortedbyx) + # The position of the last (rightmost) occurrence of yᵢ in the + # sorted data is the rank. + rightmost_occurrence = searchsortedlast(sorted_y_sortedbyx, yᵢ) + r[i] = rightmost_occurrence + end +end + +function loopcount_rs_ℓs!(r, ℓ, sorted_y_sortedbyx, y_sortedbyx) + n = length(y_sortedbyx) + + for (i, yᵢ) in enumerate(y_sortedbyx) + # Indices of the first and last occurrences of `yᵢ` in the sorted data. + leftmost_occurrence::Int = searchsortedfirst(sorted_y_sortedbyx, yᵢ) + rightmost_occurrence::Int = searchsortedlast(sorted_y_sortedbyx, yᵢ) + + # rᵢ (rank of yᵢ): count of elements less than or equal to yᵢ + r[i] = rightmost_occurrence + + # ℓ: count of elements greater than or equal to yᵢ + ℓ[i] = n - leftmost_occurrence + 1 + end + + return r, ℓ +end + +""" + chatterjee_coefficient(Δr, 𝓁, n::Integer, handle_ties::Bool) + +Compute the Chatterjee correlation coefficient given the length of the +input `n`, pre-computed first differences of the ranks `Δr` and pre-computed +`𝓁`. +""" +function chatterjee_coefficient(Δr, 𝓁, n::Integer, handle_ties::Bool) + if handle_ties + num = n*sum(Δr) + den = 2*sum(𝓁ᵢ * (n - 𝓁ᵢ) for 𝓁ᵢ in 𝓁) + else + num = 3*sum(Δr) + den = (n^2 - 1) + end + return 1 - num / den +end + +""" + absfirstdiffs!(dx, x) + +Compute absolute values of the first differences between the elements of `x` into the +preallocated x::AbstractVectoray `dx` where `length(dx) == length(x)` (not checked). +""" +function absfirstdiffs!(dx, x) + n = length(x) + for i in 1:(n - 1) + dx[i] = abs(x[i + 1] - x[i]) + end + return dx +end + +function verify_inputs_chatterjee(x, y) + if length(x) != length(y) + msg = "`length(x)` must equal `length(y)` for `ChatterjeeCorrelation`. " * + "Got length(x)=$(length(x)) and length(y)=$(length(y))." + throw(ArgumentError(msg)) + end +end + +# break ties randomly. +function isless_rand(rng, a, b) + if a < b + true + elseif a > b + false + else + rand(rng, Bool) + end +end + diff --git a/src/methods/correlation/correlation.jl b/src/methods/correlation/correlation.jl index 61d92e64..9a6a23ac 100644 --- a/src/methods/correlation/correlation.jl +++ b/src/methods/correlation/correlation.jl @@ -10,6 +10,7 @@ The supertype for correlation measures. - [`PearsonCorrelation`](@ref) - [`PartialCorrelation`](@ref) - [`DistanceCorrelation`](@ref) +- [`ChatterjeeCorrelation`](@ref) """ abstract type CorrelationMeasure <: AssociationMeasure end @@ -20,3 +21,4 @@ abstract type CorrelationMeasureEstimator{M} <: AssociationMeasure end include("pearson_correlation.jl") include("partial_correlation.jl") include("distance_correlation.jl") +include("chatterjee.jl") \ No newline at end of file diff --git a/test/independence/SurrogateAssociationTest/SurrogateAssociationTest.jl b/test/independence/SurrogateAssociationTest/SurrogateAssociationTest.jl index ec0b6200..1911ce1b 100644 --- a/test/independence/SurrogateAssociationTest/SurrogateAssociationTest.jl +++ b/test/independence/SurrogateAssociationTest/SurrogateAssociationTest.jl @@ -17,6 +17,7 @@ include("MMeasure.jl") include("LMeasure.jl") include("TransferEntropyPairwise.jl") include("crossmappings.jl") +include("chatterjee_correlation.jl") # Conditional measures include("ConditionalMutualInformation.jl") diff --git a/test/independence/SurrogateAssociationTest/chatterjee_correlation.jl b/test/independence/SurrogateAssociationTest/chatterjee_correlation.jl new file mode 100644 index 00000000..39c9c02c --- /dev/null +++ b/test/independence/SurrogateAssociationTest/chatterjee_correlation.jl @@ -0,0 +1,18 @@ +using Test +using Random +rng = Xoshiro(1234) + +# We can use surrogate tests and p-values to further verify the correctness of the +# algorithm. +test = SurrogateAssociationTest(ChatterjeeCorrelation(), nshuffles = 19) + +# We expect that we *cannot* reject the null hypothesis for independent variables +x = rand(rng, 1:10, 100) +y = rand(rng, 1:10, 100) +α = 0.05 +@test pvalue(independence(test, x, y)) > α + +# We expect that we *can* reject the null hypothesis for for dependent variables. +w = rand(rng, 1:10, 100) +z = rand(rng, 1:10, 100) .* sin.(w) .+ cos.(w) +@test pvalue(independence(test, z, w)) < α diff --git a/test/methods/correlation/chatterjee_correlation.jl b/test/methods/correlation/chatterjee_correlation.jl new file mode 100644 index 00000000..8755d678 --- /dev/null +++ b/test/methods/correlation/chatterjee_correlation.jl @@ -0,0 +1,50 @@ +using Test +using Associations +using Random +rng = Xoshiro(12345) + +x = rand(rng, 1:5, 100) +y = rand(rng, 1:5, 101) +m = ChatterjeeCorrelation() +@test_throws ArgumentError association(m, x, y) +# ---------------------------------------------------------------- +# Implementation tests +# ---------------------------------------------------------------- + +# We should get exactly the same results for preallocated measure +# as for non-preallocated measure. +for i = 1:10 + x = rand(rng, 15) + y = rand(rng, 15) + + # We must initialize identical seeds to ensure reproducible results + rng_seed = rand(rng, 1:100) + m1 = ChatterjeeCorrelation(x, y, handle_ties = false, rng = Xoshiro(rng_seed)) + m2 = ChatterjeeCorrelation(handle_ties = false, rng = Xoshiro(1234)) + c1 = association(m1, x, y) + c2 = association(m2, x, y) + @test c1 == c2 +end +# ---------------------------------------------------------------- +# "analytical" test by comparing to output from the XICOR R-package by +# the method author. +# ---------------------------------------------------------------- + +# Without repetitions +x = [1.2, 5.3, 2.4, 3.3, 7.7, 6.5] +y = [5.6, 6.6, 3.3, 4.4, 2.2, 15.0] +@test round(association(ChatterjeeCorrelation(x, y, handle_ties = false), x, y), digits = 6) ≈ 0.057143 +@test round(association(ChatterjeeCorrelation(x, y, handle_ties = true), x, y), digits = 6) ≈ 0.057143 +@test round(association(ChatterjeeCorrelation(handle_ties = false), x, y), digits = 6) ≈ 0.057143 +@test round(association(ChatterjeeCorrelation(handle_ties = false), x, y), digits = 6) ≈ 0.057143 + +# With repetitions. We have no way of comparing directly with the XICORE +# package, because there is no option to specify the random number seed +# for splitting ties. We'll just use some data where the coefficient +# should be positive. +x = [1.2, 5.3, 5.3, 2.4, 3.3, 7.7, 7.7, 6.5, 2.2] +y = [5.6, 6.6, 3.3, 4.4, 4.4, 2.2, 2.2, 15.0, 2.3] .+ x +@test round(association(ChatterjeeCorrelation(x, y, handle_ties = false), x, y), digits = 6) > 0 +@test round(association(ChatterjeeCorrelation(x, y, handle_ties = true), x, y), digits = 6) > 0 +@test round(association(ChatterjeeCorrelation(handle_ties = false), x, y), digits = 6) > 0 +@test round(association(ChatterjeeCorrelation(handle_ties = true), x, y), digits = 6) > 0 diff --git a/test/methods/correlation/correlation.jl b/test/methods/correlation/correlation.jl index a835ff39..19270028 100644 --- a/test/methods/correlation/correlation.jl +++ b/test/methods/correlation/correlation.jl @@ -1,3 +1,4 @@ include("pearson_correlation.jl") include("partial_correlation.jl") include("distance_correlation.jl") +include("chatterjee_correlation.jl")