Skip to content

Commit

Permalink
Merge pull request #18 from camilogarciabotero/dev
Browse files Browse the repository at this point in the history
Improve BCM struct correctness and other features
  • Loading branch information
camilogarciabotero authored Nov 7, 2023
2 parents 8feb371 + ac57249 commit 6437d09
Show file tree
Hide file tree
Showing 8 changed files with 151 additions and 75 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ version = "0.7.0"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
VectorizedKmers = "2ef45bd6-4c2b-43d8-88b3-40597287359a"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"

[weakdeps]
DiscreteMarkovChains = "8abcb7ef-b365-4f7b-ac38-56893fb62f9f"
Expand All @@ -32,8 +32,8 @@ julia = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"

[targets]
test = ["Test"]
4 changes: 3 additions & 1 deletion src/BioMarkovChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ include("utils.jl")
export randbmc

include("transitions.jl")
export initials, transition_count_matrix, transition_probability_matrix, logoddsratio, logoddsratioscore
export initials, transition_count_matrix, transition_probability_matrix,
odds_ratio_matrix, log_odds_ratio_matrix, log_odds_ratio_score,
dnaseqprobability

include("models.jl")
export ECOLICDS, ECOLINOCDS
Expand Down
6 changes: 0 additions & 6 deletions src/constants.jl

This file was deleted.

2 changes: 1 addition & 1 deletion src/extended.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,4 @@ function StatsAPI.fit!(bmc::BMC, inits::Vector{Float64}, tpm::Matrix{Float64})
bmc.tpm .= tpm
foreach(sum_to_one!, eachrow(bmc.tpm))
return nothing
end
end
9 changes: 5 additions & 4 deletions src/perronfrobenius.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
### MarkdovChainHammer.jl ###

"""
perronfrobenius(sequence::NucleicSeqOrView{A}, n::Int64=1) where A
perronfrobenius(sequence::SeqOrView{A}, n::Int64=1) where A
Compute the Perron-Frobenius matrix, a column-wise version of the transition probability matrix (TPM), for a given nucleotide sequence.
Compute the Perron-Frobenius matrix, a column-stochastic version of the transition probability matrix (TPM), for a given nucleotide sequence.
The Perron-Frobenius matrix captures the asymptotic probabilities of transitioning between nucleotides in the sequence over a specified number of steps `n`. It provides insight into the long-term behavior of a Markov chain or a dynamical system associated with the sequence.
# Arguments
- `sequence::NucleicSeqOrView{A}`: A nucleotide sequence represented as a `NucleicSeqOrView{A}` object.
- `sequence::SeqOrView{A}`: A nucleotide sequence represented as a `NucleicSeqOrView{A}` object.
- `n::Int64=1`: The number of steps to consider for the transition probability matrix. Default is 1.
# Returns
Expand All @@ -19,8 +19,9 @@ A copy of the Perron-Frobenius matrix. Each column of this matrix corresponds to
sequence = LongSequence{DNAAlphabet{4}}("ACGTCGTCCACTACGACATCAGC") # Replace with an actual nucleotide sequence
n = 2
pf = perronfrobenius(sequence, n)
```
"""
function perronfrobenius(sequence::NucleicSeqOrView{A}; n::Int64=1) where A
function perronfrobenius(sequence::SeqOrView{A}; n::Int64=1) where A
tpm = transition_probability_matrix(sequence, n)
return copy(tpm')
end
Expand Down
133 changes: 109 additions & 24 deletions src/transitions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,6 @@ function transition_count_matrix(sequence::LongSequence{<:AminoAcidAlphabet})
return copy(counts)
end

# function transition_count_matrix(sequence::LongAminoAcidOrView)
# matrix = [(i,j) for i in AA20, j in AA20]
# trans = transitions(sequence)
# return reshape([get(trans, t, 0) for t in matrix], size(matrix))
# end

@doc raw"""
transition_probability_matrix(sequence::LongSequence{DNAAlphabet{4}}, n::Int64=1)
Expand Down Expand Up @@ -102,6 +96,7 @@ function transition_probability_matrix(sequence::SeqOrView{A}, n::Int64 = 1) whe

freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf

@assert round.(sum(freqs, dims=2)') == [1.0 1.0 1.0 1.0] "The transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."
return n > 1 ? freqs^n : freqs
end

Expand All @@ -113,11 +108,6 @@ transition_probability_matrix(bmc::BioMarkovChain) = bmc.tpm
tpm01 = transition_probability_matrix(seq01)

@test round.(tpm01, digits = 3) == [0.0 1.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.25 0.125 0.625 0.0; 0.0 0.667 0.333 0.0]

# Handling NaN and Inf
seq02 = dna"CCTCCCGGCCCTGGGCTCGGGC"
tpm02 = transition_probability_matrix(seq02)
@test round.(tpm02, digits = 3) == [0.0 0.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.0 0.375 0.625 0.0; 0.0 0.667 0.333 0.0]
end

@doc raw"""
Expand Down Expand Up @@ -159,8 +149,17 @@ initials(bmc::BioMarkovChain) = bmc.inits

# findfirst(i -> i == (AA_T, AA_R), aamatrix)

function odds_ratio_matrix(
sequence::SeqOrView{A},
model::BioMarkovChain;
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
tpm = transition_probability_matrix(sequence)
return tpm ./ model.tpm
end

"""
logoddsratio(sequence::NucleicSeqOrView{A}, model::BioMarkovChain) where A
log_odds_ratio_matrix(sequence::NucleicSeqOrView{A}, model::BioMarkovChain) where A
Calculates the log-odds ratio between the transition probability matrix of a given DNA sequence and a reference model.
Expand All @@ -175,21 +174,24 @@ Calculates the log-odds ratio between the transition probability matrix of a giv
```julia
sequence = LongNucOrView{4}("ACGT")
model = BioMarkovChain(sequence) # Provide appropriate initialization for BioMarkovChain
result = logoddsratio(sequence, model)
result = log_odds_ratio_matrix(sequence, model)
```
"""
function logoddsratio(
sequence::NucleicSeqOrView{A},
function log_odds_ratio_matrix(
sequence::SeqOrView{A},
model::BioMarkovChain;
b::Number =
) where A

@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
tpm = transition_probability_matrix(sequence)

return log.(b, tpm./model.tpm)
lorm = log.(b, tpm./model.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return lorm
end

"""
logoddsratio(model1::BioMarkovChain, model2::BioMarkovChain)
log_odds_ratio_matrix(model1::BioMarkovChain, model2::BioMarkovChain)
Calculates the log-odds ratio between the transition probability matrices of two BioMarkovChain models.
Expand All @@ -199,20 +201,103 @@ Calculates the log-odds ratio between the transition probability matrices of two
- `model2::BioMarkovChain`: The second BioMarkovChain model.
"""
function logoddsratio(
function log_odds_ratio_matrix(
model1::BioMarkovChain,
model2::BioMarkovChain;
b::Number =
)
return log.(b, model1.tpm ./ model2.tpm)
@assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
lorm = log.(b, model1.tpm ./ model2.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return lorm
end

function logoddsratioscore(
sequence::NucleicSeqOrView{A},
"""
log_odds_ratio_score(sequence::SeqOrView{A}, model::BioMarkovChain; b::Number = ℯ)
Compute the log odds ratio score between a given sequence and a BioMarkovChain model.
# Arguments
- `sequence::SeqOrView{A}`: A sequence of elements of type `A`.
- `model::BioMarkovChain`: A BioMarkovChain model.
- `b::Number = ℯ`: The base of the logarithm used to compute the log odds ratio.
# Returns
The log odds ratio score between the sequence and the model.
# Example
"""
function log_odds_ratio_score(
sequence::SeqOrView{A},
model::BioMarkovChain;
b::Number =
) where A
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
tpm = transition_probability_matrix(sequence)
lorm = log.(b, tpm ./ model.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return sum(lorm)/length(sequence)
end

@doc raw"""
sequenceprobability(sequence::LongNucOrView{4}, model::BioMarkovChain)
Compute the probability of a given sequence using a transition probability matrix and the initial probabilities distributions of a `BioMarkovModel`.
```math
P(X_1 = i_1, \ldots, X_T = i_T) = \pi_{i_1}^{T-1} \prod_{t=1}^{T-1} a_{i_t, i_{t+1}}
```
# Arguments
- `sequence::LongNucOrView{4}`: The input sequence of nucleotides.
- `tm::BioMarkovChain` is the actual data structure composed of a `tpm::Matrix{Float64}` the transition probability matrix and `initials=Vector{Float64}` the initial state probabilities.
# Returns
- `probability::Float64`: The probability of the input sequence.
# Example
```
mainseq = LongDNA{4}("CCTCCCGGACCCTGGGCTCGGGAC")
bmc = BioMarkovChain(mainseq)
BioMarkovChain with DNA Alphabet:
- Transition Probability Matrix -> Matrix{Float64}(4 × 4):
0.0 1.0 0.0 0.0
0.0 0.5 0.2 0.3
0.25 0.125 0.625 0.0
0.0 0.6667 0.3333 0.0
- Initial Probabilities -> Vector{Float64}(4 × 1):
0.087
0.4348
0.3478
0.1304
- Markov Chain Order -> Int64:
1
newseq = LongDNA{4}("CCTG")
4nt DNA Sequence:
CCTG
dnaseqprobability(newseq, bmc)
0.0217
```
"""
function dnaseqprobability(
sequence::NucleicSeqOrView{A},
model::BioMarkovChain
) where A

init = model.inits[_dna_to_int(sequence[1])]

probability = init

return sum(log.(b, tpm./model.tpm)) / length(sequence)
for t in 1:length(sequence)-1
probability *= model.tpm[_dna_to_int(sequence[t]), _dna_to_int(sequence[t+1])]
end
return probability
end
21 changes: 7 additions & 14 deletions src/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,16 @@ struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Inte
tpm::M # The probabilities of the TransitionProbabilityMatrix struct
inits::I # the initials distribution of probabilities
n::N # The order of the Markov chain
function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer}
bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
return bmc
function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer}
bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
return bmc
end

function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A
inits = Array{Float64, 1}(undef, 1)
tcm = transition_count_matrix(sequence)
inits = vec(sum(tcm, dims = 1) ./ sum(tcm))

rowsums = sum(tcm, dims = 2)
freqs = tcm ./ rowsums

freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf

bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? freqs^n : freqs, inits, n)
return bmc
inits = initials(sequence)
tpm = transition_probability_matrix(sequence)
bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n)
return bmc
end
end

Expand Down
47 changes: 24 additions & 23 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,31 +1,32 @@
function _int_to_dna(index::Int64; extended_alphabet::Bool = false)
A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
return LongSequence{DNAAlphabet{4}}([A[index]])
function _int_to_dna(index::Int64)
# A = extended_alphabet ? alphabet(DNA) : ACGT
modifier(value) = (value == 3) ? 4 : (value == 4) ? 8 : value
return reinterpret.(DNA, Int8(modifier(index))) #LongSequence{DNAAlphabet{4}}([A[index]]) # reinterpret.(DNA, Int8(1)) == A, reinterpret.(Int8, DNA_A) = 1
end

function _dna_to_int(nucleotide::DNA; extended_alphabet::Bool = false)
A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
return findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
function _dna_to_int(nucleotide::DNA)
# A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T]
modifier(value) = (value == DNA_G) ? DNA_M : (value == DNA_T) ? DNA_G : value
return reinterpret.(Int8, modifier(nucleotide))[1] #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
end

function randbmc(statespace::DataType, n::Int64=1)
if statespace == DNA || statespace == RNA
tpm = rand(4, 4)
row_sums = sum(tpm, dims=2)
inits = rand(4)
init_sum = sum(inits)
@views tpm ./= row_sums
@views inits ./= init_sum
elseif statespace == AminoAcid
tpm = rand(20, 20)
row_sums = sum(tpm, dims=2)
inits = rand(20)
init_sum = sum(inits)
@views tpm ./= row_sums
@views inits ./= init_sum
else
error("Alphabet must be of the DNA, RNA or AminoAcid DataType")

if !(statespace in (DNA, RNA, AminoAcid))
throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType."))
end

nstates = (statespace == AminoAcid) ? 20 : 4
tpm = rand(nstates, nstates)

# Normalize rows of the transition probability matrix
rowsums = sum(tpm, dims=2)
@views tpm ./= rowsums

# Generate random initial probabilities and normalize them
inits = rand(nstates)
initsum = sum(inits)
@views inits ./= initsum

return BMC(statespace, tpm, inits, n)
end
end

0 comments on commit 6437d09

Please sign in to comment.