Merge pull request #18 from camilogarciabotero/dev

Improve BCM struct correctness and other features
BioJulia · Nov 7, 2023 · 6437d09 · 6437d09
2 parents 8feb371 + ac57249
commit 6437d09
Show file tree

Hide file tree

Showing 8 changed files with 151 additions and 75 deletions.
diff --git a/Project.toml b/Project.toml
@@ -7,9 +7,9 @@ version = "0.7.0"
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
 VectorizedKmers = "2ef45bd6-4c2b-43d8-88b3-40597287359a"
-TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 
 [weakdeps]
 DiscreteMarkovChains = "8abcb7ef-b365-4f7b-ac38-56893fb62f9f"
@@ -32,8 +32,8 @@ julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
+TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
 
 [targets]
 test = ["Test"]
diff --git a/src/BioMarkovChains.jl b/src/BioMarkovChains.jl
@@ -36,7 +36,9 @@ include("utils.jl")
 export randbmc
 
 include("transitions.jl")
-export initials, transition_count_matrix, transition_probability_matrix, logoddsratio, logoddsratioscore
+export initials, transition_count_matrix, transition_probability_matrix, 
+odds_ratio_matrix, log_odds_ratio_matrix, log_odds_ratio_score, 
+dnaseqprobability
 
 include("models.jl")
 export ECOLICDS, ECOLINOCDS

diff --git a/src/constants.jl b/src/constants.jl
diff --git a/src/extended.jl b/src/extended.jl
@@ -62,4 +62,4 @@ function StatsAPI.fit!(bmc::BMC, inits::Vector{Float64}, tpm::Matrix{Float64})
     bmc.tpm .= tpm
     foreach(sum_to_one!, eachrow(bmc.tpm))
     return nothing
-end
+end
diff --git a/src/perronfrobenius.jl b/src/perronfrobenius.jl
@@ -1,14 +1,14 @@
 ### MarkdovChainHammer.jl ###
 
 """
-    perronfrobenius(sequence::NucleicSeqOrView{A}, n::Int64=1) where A
+    perronfrobenius(sequence::SeqOrView{A}, n::Int64=1) where A
 
-Compute the Perron-Frobenius matrix, a column-wise version of the transition probability matrix (TPM), for a given nucleotide sequence.
+Compute the Perron-Frobenius matrix, a column-stochastic version of the transition probability matrix (TPM), for a given nucleotide sequence.
 
 The Perron-Frobenius matrix captures the asymptotic probabilities of transitioning between nucleotides in the sequence over a specified number of steps `n`. It provides insight into the long-term behavior of a Markov chain or a dynamical system associated with the sequence.
 
 # Arguments
-- `sequence::NucleicSeqOrView{A}`: A nucleotide sequence represented as a `NucleicSeqOrView{A}` object.
+- `sequence::SeqOrView{A}`: A nucleotide sequence represented as a `NucleicSeqOrView{A}` object.
 - `n::Int64=1`: The number of steps to consider for the transition probability matrix. Default is 1.
 
 # Returns
@@ -19,8 +19,9 @@ A copy of the Perron-Frobenius matrix. Each column of this matrix corresponds to
 sequence = LongSequence{DNAAlphabet{4}}("ACGTCGTCCACTACGACATCAGC")  # Replace with an actual nucleotide sequence
 n = 2
 pf = perronfrobenius(sequence, n)
+```
 """
-function perronfrobenius(sequence::NucleicSeqOrView{A}; n::Int64=1) where A
+function perronfrobenius(sequence::SeqOrView{A}; n::Int64=1) where A
     tpm = transition_probability_matrix(sequence, n)
     return copy(tpm')
 end

diff --git a/src/transitions.jl b/src/transitions.jl
@@ -32,12 +32,6 @@ function transition_count_matrix(sequence::LongSequence{<:AminoAcidAlphabet})
     return copy(counts)
 end
 
-# function transition_count_matrix(sequence::LongAminoAcidOrView)
-#     matrix = [(i,j) for i in AA20, j in AA20]
-#     trans = transitions(sequence)
-#     return reshape([get(trans, t, 0) for t in matrix], size(matrix))
-# end
-
 @doc raw"""
     transition_probability_matrix(sequence::LongSequence{DNAAlphabet{4}}, n::Int64=1)
 
@@ -102,6 +96,7 @@ function transition_probability_matrix(sequence::SeqOrView{A}, n::Int64 = 1) whe
 
     freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf
 
+    @assert round.(sum(freqs, dims=2)') == [1.0 1.0 1.0 1.0] "The transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
     return n > 1 ? freqs^n : freqs
 end
 
@@ -113,11 +108,6 @@ transition_probability_matrix(bmc::BioMarkovChain) = bmc.tpm
     tpm01 = transition_probability_matrix(seq01)
 
     @test round.(tpm01, digits = 3) == [0.0 1.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.25 0.125 0.625 0.0; 0.0 0.667 0.333 0.0]
-
-    # Handling NaN and Inf
-    seq02 = dna"CCTCCCGGCCCTGGGCTCGGGC"
-    tpm02 = transition_probability_matrix(seq02)
-    @test round.(tpm02, digits = 3) == [0.0 0.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.0 0.375 0.625 0.0; 0.0 0.667 0.333 0.0]
 end
 
 @doc raw"""
@@ -159,8 +149,17 @@ initials(bmc::BioMarkovChain) = bmc.inits
 
 # findfirst(i -> i == (AA_T, AA_R), aamatrix)
 
+function odds_ratio_matrix(
+    sequence::SeqOrView{A},
+    model::BioMarkovChain;
+) where A
+    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    tpm = transition_probability_matrix(sequence)
+    return tpm ./ model.tpm
+end
+
 """
-    logoddsratio(sequence::NucleicSeqOrView{A}, model::BioMarkovChain) where A
+    log_odds_ratio_matrix(sequence::NucleicSeqOrView{A}, model::BioMarkovChain) where A
 
 Calculates the log-odds ratio between the transition probability matrix of a given DNA sequence and a reference model.
 
@@ -175,21 +174,24 @@ Calculates the log-odds ratio between the transition probability matrix of a giv
 ```julia
 sequence = LongNucOrView{4}("ACGT")
 model = BioMarkovChain(sequence)  # Provide appropriate initialization for BioMarkovChain
-result = logoddsratio(sequence, model)
+result = log_odds_ratio_matrix(sequence, model)
 ```
 """
-function logoddsratio(
-    sequence::NucleicSeqOrView{A},
+function log_odds_ratio_matrix(
+    sequence::SeqOrView{A},
     model::BioMarkovChain;
     b::Number = ℯ
 ) where A
+
+    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
     tpm = transition_probability_matrix(sequence)
-
-    return log.(b, tpm./model.tpm) 
+    lorm = log.(b, tpm./model.tpm)
+    lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
+    return lorm
 end
 
 """
-logoddsratio(model1::BioMarkovChain, model2::BioMarkovChain)
+    log_odds_ratio_matrix(model1::BioMarkovChain, model2::BioMarkovChain)
 
 Calculates the log-odds ratio between the transition probability matrices of two BioMarkovChain models.
 
@@ -199,20 +201,103 @@ Calculates the log-odds ratio between the transition probability matrices of two
 - `model2::BioMarkovChain`: The second BioMarkovChain model.
 
 """
-function logoddsratio(
+function log_odds_ratio_matrix(
     model1::BioMarkovChain,
     model2::BioMarkovChain;
     b::Number = ℯ
 )
-    return log.(b, model1.tpm ./ model2.tpm) 
+    @assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
+    lorm = log.(b, model1.tpm ./ model2.tpm)
+    lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
+    return lorm
 end
 
-function logoddsratioscore(
-    sequence::NucleicSeqOrView{A},
+"""
+    log_odds_ratio_score(sequence::SeqOrView{A}, model::BioMarkovChain; b::Number = ℯ)
+
+Compute the log odds ratio score between a given sequence and a BioMarkovChain model.
+
+# Arguments
+- `sequence::SeqOrView{A}`: A sequence of elements of type `A`.
+- `model::BioMarkovChain`: A BioMarkovChain model.
+- `b::Number = ℯ`: The base of the logarithm used to compute the log odds ratio.
+
+# Returns
+The log odds ratio score between the sequence and the model.
+
+# Example
+"""
+function log_odds_ratio_score(
+    sequence::SeqOrView{A},
     model::BioMarkovChain;
     b::Number = ℯ
-) where A
+) where A 
+    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
     tpm = transition_probability_matrix(sequence)
+    lorm = log.(b, tpm ./ model.tpm)
+    lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
+    return sum(lorm)/length(sequence)
+end
+
+@doc raw"""
+    sequenceprobability(sequence::LongNucOrView{4}, model::BioMarkovChain)
+
+Compute the probability of a given sequence using a transition probability matrix and the initial probabilities distributions of a `BioMarkovModel`.
+
+```math
+P(X_1 = i_1, \ldots, X_T = i_T) = \pi_{i_1}^{T-1} \prod_{t=1}^{T-1} a_{i_t, i_{t+1}}
+```
+
+# Arguments
+- `sequence::LongNucOrView{4}`: The input sequence of nucleotides.
+- `tm::BioMarkovChain` is the actual data structure composed of a `tpm::Matrix{Float64}` the transition probability matrix and `initials=Vector{Float64}` the initial state probabilities.
+
+# Returns
+- `probability::Float64`: The probability of the input sequence.
+
+# Example
+
+```
+mainseq = LongDNA{4}("CCTCCCGGACCCTGGGCTCGGGAC")
+   
+bmc = BioMarkovChain(mainseq)
+
+BioMarkovChain with DNA Alphabet:
+  - Transition Probability Matrix -> Matrix{Float64}(4 × 4):
+   0.0     1.0     0.0     0.0
+   0.0     0.5     0.2     0.3
+   0.25    0.125   0.625   0.0
+   0.0     0.6667  0.3333  0.0
+  - Initial Probabilities -> Vector{Float64}(4 × 1):
+   0.087
+   0.4348
+   0.3478
+   0.1304
+  - Markov Chain Order -> Int64:
+   1
+
+newseq = LongDNA{4}("CCTG")
+
+    4nt DNA Sequence:
+    CCTG
+
+
+dnaseqprobability(newseq, bmc)
+    
+    0.0217
+```
+"""
+function dnaseqprobability(
+    sequence::NucleicSeqOrView{A},
+    model::BioMarkovChain
+) where A
+
+    init = model.inits[_dna_to_int(sequence[1])]
+
+    probability = init
 
-    return sum(log.(b, tpm./model.tpm)) / length(sequence)
+    for t in 1:length(sequence)-1
+        probability *= model.tpm[_dna_to_int(sequence[t]), _dna_to_int(sequence[t+1])]
+    end
+    return probability
 end
diff --git a/src/types.jl b/src/types.jl
@@ -39,23 +39,16 @@ struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Inte
   tpm::M # The probabilities of the TransitionProbabilityMatrix struct
   inits::I # the initials distribution of probabilities
   n::N # The order of the Markov chain
-  function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer}
-      bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
-      return bmc
+  function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} 
+    bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
+    return bmc
   end
 
   function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A
-      inits = Array{Float64, 1}(undef, 1)
-      tcm = transition_count_matrix(sequence)
-      inits = vec(sum(tcm, dims = 1) ./ sum(tcm))
-
-      rowsums = sum(tcm, dims = 2)
-      freqs = tcm ./ rowsums
-
-      freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf
-
-      bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? freqs^n : freqs, inits, n)
-      return bmc
+    inits = initials(sequence)
+    tpm = transition_probability_matrix(sequence)
+    bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n)
+    return bmc
   end
 end
 

diff --git a/src/utils.jl b/src/utils.jl
@@ -1,31 +1,32 @@
-function _int_to_dna(index::Int64; extended_alphabet::Bool = false)
-    A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
-    return LongSequence{DNAAlphabet{4}}([A[index]])
+function _int_to_dna(index::Int64)
+    # A = extended_alphabet ? alphabet(DNA) : ACGT
+    modifier(value) = (value == 3) ? 4 : (value == 4) ? 8 : value
+    return reinterpret.(DNA, Int8(modifier(index))) #LongSequence{DNAAlphabet{4}}([A[index]]) # reinterpret.(DNA, Int8(1)) == A, reinterpret.(Int8, DNA_A) = 1
 end
 
-function _dna_to_int(nucleotide::DNA; extended_alphabet::Bool = false)
-    A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
-    return findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
+function _dna_to_int(nucleotide::DNA)
+    # A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
+    modifier(value) = (value == DNA_G) ? DNA_M : (value == DNA_T) ? DNA_G : value
+    return reinterpret.(Int8, modifier(nucleotide))[1]  #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
 end
 
 function randbmc(statespace::DataType, n::Int64=1)
-    if statespace == DNA || statespace == RNA
-        tpm = rand(4, 4)
-        row_sums = sum(tpm, dims=2)
-        inits = rand(4)
-        init_sum = sum(inits)
-        @views tpm ./= row_sums
-        @views inits ./= init_sum
-    elseif statespace == AminoAcid
-        tpm = rand(20, 20)
-        row_sums = sum(tpm, dims=2)
-        inits = rand(20)
-        init_sum = sum(inits)
-        @views tpm ./= row_sums
-        @views inits ./= init_sum
-    else
-        error("Alphabet must be of the DNA, RNA or AminoAcid DataType")
+
+    if !(statespace in (DNA, RNA, AminoAcid))
+        throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType."))
     end
 
+    nstates = (statespace == AminoAcid) ? 20 : 4
+    tpm = rand(nstates, nstates)
+
+    # Normalize rows of the transition probability matrix
+    rowsums = sum(tpm, dims=2)
+    @views tpm ./= rowsums
+
+    # Generate random initial probabilities and normalize them
+    inits = rand(nstates)
+    initsum = sum(inits)
+    @views inits ./= initsum
+
     return BMC(statespace, tpm, inits, n)
-end
+end