Skip to content

Commit

Permalink
Merge pull request #20 from camilogarciabotero:dev
Browse files Browse the repository at this point in the history
Refactor `tcm` to work with AAs again
  • Loading branch information
camilogarciabotero authored Nov 17, 2023
2 parents 869b9aa + b6b7063 commit 3ed2876
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ PrecompileTools = "1"
StatsAPI = "1.6"
TestItemRunner = "0.2"
TestItems = "0.1"
VectorizedKmers = "0.7"
VectorizedKmers = "0"
julia = "1"

[extras]
Expand Down
2 changes: 1 addition & 1 deletion src/BioMarkovChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ odds_ratio_matrix, log_odds_ratio_matrix, log_odds_ratio_score,
dnaseqprobability

include("models.jl")
export ECOLICDS, ECOLINOCDS
export ECOLICDS, ECOLINOCDS, CPGPOS, CPGNEG

include("perronfrobenius.jl")
export perronfrobenius # generatedna
Expand Down
48 changes: 39 additions & 9 deletions src/transitions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ function transition_count_matrix(sequence::NucleicSeqOrView{A}) where A
return copy(counts)
end

function transition_count_matrix(sequence::LongSequence{<:AminoAcidAlphabet})
function transition_count_matrix(sequence::SeqOrView{<:AminoAcidAlphabet})
counts = reshape(count_kmers(sequence, 2), (20,20))'
return copy(counts)
end
Expand Down Expand Up @@ -96,7 +96,6 @@ function transition_probability_matrix(sequence::SeqOrView{A}, n::Int64 = 1) whe

freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf

@assert round.(sum(freqs, dims=2)') == [1.0 1.0 1.0 1.0] "The transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."
return n > 1 ? freqs^n : freqs
end

Expand All @@ -106,8 +105,11 @@ transition_probability_matrix(bmc::BioMarkovChain) = bmc.tpm
using BioSequences, BioMarkovChains
seq01 = dna"CCTCCCGGACCCTGGGCTCGGGAC"
tpm01 = transition_probability_matrix(seq01)

@test round.(tpm01, digits = 3) == [0.0 1.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.25 0.125 0.625 0.0; 0.0 0.667 0.333 0.0]

seq02 = aa"ACDEFGHIKLMNPQRSTVWY"
tpm02 = transition_probability_matrix(seq02)
@test round.(tpm02, digits = 3) == [0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0]
end

@doc raw"""
Expand Down Expand Up @@ -182,19 +184,27 @@ function log_odds_ratio_matrix(
model::BioMarkovChain;
b::Number =
) where A

@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

tpm = transition_probability_matrix(sequence)
@assert round.(sum(tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Sequence transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

lorm = log.(b, tpm./model.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return lorm
end

"""
@doc raw"""
log_odds_ratio_matrix(model1::BioMarkovChain, model2::BioMarkovChain)
Calculates the log-odds ratio between the transition probability matrices of two BioMarkovChain models.
```math
\beta = \log \left( \frac{P(x|\mathscr{m}_{1})}{P(x|\text{\mathscr{m}_{2})} \right)
```
Where $\mathscr{m}_{1}$ and $\mathscr{m}_{2}$ are the two models transition probability matrices.
# Arguments
- `model1::BioMarkovChain`: The first BioMarkovChain model.
Expand All @@ -207,16 +217,31 @@ function log_odds_ratio_matrix(
b::Number =
)
@assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
@assert round.(sum(model1.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 1 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."
@assert round.(sum(model2.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 2 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

lorm = log.(b, model1.tpm ./ model2.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return lorm
end

"""
@testitem "lorm" begin
using BioSequences, BioMarkovChains

lorm01 = log_odds_ratio_matrix(CPGPOS, CPGNEG, b=2) # Extracted from the Biological Sequence Analysis: probabilistic models of proteins and nucleic acids, Durbin et al. 1998
lorm01 == [-0.7369655941662062 0.4185519834550808 0.5798915111737342 -0.8073549220576043; -0.913064363228719 0.3043934355948513 1.8126298640982785 -0.6838158876474414; -0.6232794322722583 0.4626269577971041 0.3315782649210818 -0.7346554334790053; -1.1638248019058943 0.5708084064112958 0.3951379418411391 -0.6820299186813209]

end

@doc raw"""
log_odds_ratio_score(sequence::SeqOrView{A}, model::BioMarkovChain; b::Number = ℯ)
Compute the log odds ratio score between a given sequence and a BioMarkovChain model.
```math
S(x) = \sum_{i=1}^{L} \beta_{x_{i}x} = \sum_{i=1} \log \frac{a^{\mathscr{m}_{1}}_{i-1} x_i}{a^{\mathscr{m}_{2}}_{i-1} x_i}
```
# Arguments
- `sequence::SeqOrView{A}`: A sequence of elements of type `A`.
- `model::BioMarkovChain`: A BioMarkovChain model.
Expand All @@ -233,7 +258,12 @@ function log_odds_ratio_score(
b::Number =
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

tpm = transition_probability_matrix(sequence)

@assert round.(sum(tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Sequence transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

lorm = log.(b, tpm ./ model.tpm)
lorm[isnan.(lorm) .| isinf.(lorm)] .= 0.0
return sum(lorm)/length(sequence)
Expand All @@ -250,10 +280,10 @@ P(X_1 = i_1, \ldots, X_T = i_T) = \pi_{i_1}^{T-1} \prod_{t=1}^{T-1} a_{i_t, i_{t
# Arguments
- `sequence::LongNucOrView{4}`: The input sequence of nucleotides.
- `tm::BioMarkovChain` is the actual data structure composed of a `tpm::Matrix{Float64}` the transition probability matrix and `initials=Vector{Float64}` the initial state probabilities.
- `model::BioMarkovChain` is the actual data structure composed of a `tpm::Matrix{Float64}` the transition probability matrix and `initials=Vector{Float64}` the initial state probabilities.
# Returns
- `probability::Float64`: The probability of the input sequence.
- `probability::Float64`: The probability of the input sequence given the model.
# Example
Expand Down Expand Up @@ -291,7 +321,7 @@ function dnaseqprobability(
sequence::NucleicSeqOrView{A},
model::BioMarkovChain
) where A

@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
init = model.inits[_dna_to_int(sequence[1])]

probability = init
Expand Down

0 comments on commit 3ed2876

Please sign in to comment.