diff --git a/Project.toml b/Project.toml index e4bc93f..eb5dd7e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,23 +1,23 @@ name = "BioMarkovChains" uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c" -authors = ["Camilo García"] +authors = ["Camilo García-Botero"] version = "0.6.0" -[weakdeps] -DiscreteMarkovChains = "8abcb7ef-b365-4f7b-ac38-56893fb62f9f" -MarkovChainHammer = "38c40fd0-bccb-4723-b30d-b2caea0ad8d9" - -[extensions] -DiscreteMarkovChainsExt = "DiscreteMarkovChains" - [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" StatsAPI = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe" VectorizedKmers = "2ef45bd6-4c2b-43d8-88b3-40597287359a" +TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" + +[weakdeps] +DiscreteMarkovChains = "8abcb7ef-b365-4f7b-ac38-56893fb62f9f" +MarkovChainHammer = "38c40fd0-bccb-4723-b30d-b2caea0ad8d9" + +[extensions] +DiscreteMarkovChainsExt = "DiscreteMarkovChains" +MarkovChainHammerExt = "MarkovChainHammer" [compat] BioSequences = "3" @@ -25,7 +25,6 @@ DiscreteMarkovChains = "0.2" MarkovChainHammer = "0.0.11" PrecompileTools = "1" StatsAPI = "1.6" -StatsBase = "0.34.0" TestItemRunner = "0.2" TestItems = "0.1" VectorizedKmers = "0.7" @@ -33,6 +32,7 @@ julia = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe" TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" [targets] diff --git a/ext/DiscreteMarkovChainsExt.jl b/ext/DiscreteMarkovChainsExt.jl index df8ec4d..35802d0 100644 --- a/ext/DiscreteMarkovChainsExt.jl +++ b/ext/DiscreteMarkovChainsExt.jl @@ -3,7 +3,9 @@ module DiscreteMarkovChainsExt using BioMarkovChains using DiscreteMarkovChains: DiscreteMarkovChains, DiscreteMarkovChain, - is_absorbing, is_ergodic, is_regular, is_reversible + is_absorbing, is_ergodic, is_regular, is_reversible, + stationary_distribution, first_passage_probabilities, exit_probabilities, + fundamental_matrix, mean_first_passage_time, mean_recurrence_time, mean_time_to_absorption ## Boolean functions ## @@ -25,6 +27,40 @@ function DiscreteMarkovChains.is_reversible(bmc::BioMarkovChain) return is_reversible(DiscreteMarkovChain(bmc.tpm)) end -## +## Probability functions + +import DiscreteMarkovChains: stationary_distribution, first_passage_probabilities, exit_probabilities + +function DiscreteMarkovChains.exit_probabilities(bmc::BioMarkovChain) + return exit_probabilities(DiscreteMarkovChain(bmc.tpm)) +end + +function DiscreteMarkovChains.stationary_distribution(bmc::BioMarkovChain) + return stationary_distribution(DiscreteMarkovChain(bmc.tpm)) +end + +function DiscreteMarkovChains.first_passage_probabilities(bmc::BioMarkovChain, t) # it recieves other args... + return first_passage_probabilities(DiscreteMarkovChain(bmc.tpm), t) +end + +## Mean time functions + +import DiscreteMarkovChains: fundamental_matrix, mean_first_passage_time, mean_recurrence_time, mean_time_to_absorption + +function DiscreteMarkovChains.fundamental_matrix(bmc::BioMarkovChain) + return fundamental_matrix(DiscreteMarkovChain(bmc.tpm)) +end + +function DiscreteMarkovChains.mean_first_passage_time(bmc::BioMarkovChain) + return mean_first_passage_time(DiscreteMarkovChain(bmc.tpm)) +end + +function DiscreteMarkovChains.mean_recurrence_time(bmc::BioMarkovChain) + return mean_recurrence_time(DiscreteMarkovChain(bmc.tpm)) +end + +function DiscreteMarkovChains.mean_time_to_absorption(bmc::BioMarkovChain) + return mean_time_to_absorption(DiscreteMarkovChain(bmc.tpm)) +end end \ No newline at end of file diff --git a/ext/MarkovChainHammerExt.jl b/ext/MarkovChainHammerExt.jl index dec00fc..328ee21 100644 --- a/ext/MarkovChainHammerExt.jl +++ b/ext/MarkovChainHammerExt.jl @@ -1,6 +1,12 @@ -# module MarkovChainHammerExt +module MarkovChainHammerExt -# using BioMarkovChains -# using MarkovChainHammer.Trajectory: generate +using BioMarkovChains +using MarkovChainHammer: generate -# end \ No newline at end of file +# function MarkovChainHammer.generate(pf::Matrix{Float64}, steps::Int64) +# return generate(pf, steps) +# end + +export generate + +end \ No newline at end of file diff --git a/src/BioMarkovChains.jl b/src/BioMarkovChains.jl index f95dd5f..2d30d19 100644 --- a/src/BioMarkovChains.jl +++ b/src/BioMarkovChains.jl @@ -11,11 +11,9 @@ using BioSequences: NucleicAcidAlphabet, DNA, DNAAlphabet, - DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N, #RNA RNA, - RNA_Gap, RNA_A, RNA_C, RNA_M, RNA_G, RNA_R, RNA_S, RNA_V, RNA_U, RNA_W, RNA_Y, RNA_H, RNA_K, RNA_D, RNA_B, RNA_N, #AminoAcids AminoAcid, @@ -24,32 +22,27 @@ using BioSequences: AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap, # Other functions - SeqOrView, - alphabet, - @biore_str - + SeqOrView, NucleicSeqOrView + using PrecompileTools: @setup_workload, @compile_workload using TestItems: @testitem using StatsAPI: StatsAPI, fit, fit! -using StatsBase: countmap using VectorizedKmers: count_kmers include("types.jl") export BioMarkovChain, BMC include("utils.jl") -export transitions, hasprematurestop, randbmc +export randbmc include("transitions.jl") -export initials, transition_count_matrix, transition_probability_matrix, dnaseqprobability, logoddsratio, logoddsratioscore +export initials, transition_count_matrix, transition_probability_matrix, logoddsratio, logoddsratioscore include("models.jl") export ECOLICDS, ECOLINOCDS include("perronfrobenius.jl") -export perronfrobenius, generatedna - -include("constants.jl") +export perronfrobenius # generatedna include("extended.jl") @@ -61,12 +54,10 @@ include("extended.jl") @compile_workload begin # all calls in this block will be precompiled, regardless of whether # they belong to your package or not (on Julia 1.8 and higher) - transitions(seq) transition_count_matrix(seq) transition_probability_matrix(seq) - dnaseqprobability(seq, ECOLICDS) - BioMarkovChain(seq) perronfrobenius(seq) + BioMarkovChain(seq) end end diff --git a/src/constants.jl b/src/constants.jl index ce6bcd1..d71a8ac 100644 --- a/src/constants.jl +++ b/src/constants.jl @@ -2,25 +2,5 @@ # const LongAminoAcidOrView = Union{LongSequence{<:AminoAcidAlphabet},LongSubSeq{<:AminoAcidAlphabet}} -const NUCLEICINDEXES = Dict(DNA_A => 1, DNA_C => 2, DNA_G => 3, DNA_T => 4) -const DINUCLEICINDEXES = Dict( - LongDNA{4}("AA") => [1, 1], - LongDNA{4}("AC") => [1, 2], - LongDNA{4}("AG") => [1, 3], - LongDNA{4}("AT") => [1, 4], - LongDNA{4}("CA") => [2, 1], - LongDNA{4}("CC") => [2, 2], - LongDNA{4}("CG") => [2, 3], - LongDNA{4}("CT") => [2, 4], - LongDNA{4}("GA") => [3, 1], - LongDNA{4}("GC") => [3, 2], - LongDNA{4}("GG") => [3, 3], - LongDNA{4}("GT") => [3, 4], - LongDNA{4}("TA") => [4, 1], - LongDNA{4}("TC") => [4, 2], - LongDNA{4}("TG") => [4, 3], - LongDNA{4}("TT") => [4, 4], -) - -const AA20 = (AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V) \ No newline at end of file +# const AA20 = (AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V) \ No newline at end of file diff --git a/src/extended.jl b/src/extended.jl index 2f91d23..30dd5bc 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -6,7 +6,8 @@ function Base.show(io::IO, model::BioMarkovChain) # println(io, "BioMarkovChain:") # Determine the alphabet type - alphabet_type = length(model) == 20 ? "AminoAcids" : "DNA/RNA" + # alphabet_type = length(model) == 20 ? "AminoAcids" : "DNA/RNA" + alphabet_type = eltype(model) # Print the type name with inferred alphabet type println(io, "BioMarkovChain with $alphabet_type Alphabet:") @@ -47,7 +48,7 @@ end Base.length(bmc::BioMarkovChain) = length(bmc.inits) function Base.eltype(bmc::BioMarkovChain) - return length(bmc) == 4 ? DNA : AminoAcid + return bmc.statespace end """ diff --git a/src/models.jl b/src/models.jl index 89bbcce..48cbeba 100644 --- a/src/models.jl +++ b/src/models.jl @@ -8,7 +8,7 @@ const ECOLICDS = begin inits = [0.245, 0.243, 0.273, 0.239] - BMC(tpm, inits) + BMC(DNA, tpm, inits) end const ECOLINOCDS = begin @@ -21,7 +21,7 @@ const ECOLINOCDS = begin inits = [0.262, 0.239, 0.240, 0.259] - BMC(tpm, inits) + BMC(DNA, tpm, inits) end @@ -35,7 +35,7 @@ const CPGPOS = begin inits = [0.262, 0.239, 0.240, 0.259] # not stablished - BMC(tpm, inits) + BMC(DNA, tpm, inits) end @@ -49,5 +49,5 @@ const CPGNEG = begin inits = [0.262, 0.239, 0.240, 0.259] # not stablished - BMC(tpm, inits) + BMC(DNA, tpm, inits) end \ No newline at end of file diff --git a/src/perronfrobenius.jl b/src/perronfrobenius.jl index d4d4d85..6245e2c 100644 --- a/src/perronfrobenius.jl +++ b/src/perronfrobenius.jl @@ -1,14 +1,14 @@ ### MarkdovChainHammer.jl ### """ - perronfrobenius(sequence::LongNucOrView{4}, n::Int64=1) + perronfrobenius(sequence::NucleicSeqOrView{A}, n::Int64=1) where A Compute the Perron-Frobenius matrix, a column-wise version of the transition probability matrix (TPM), for a given nucleotide sequence. The Perron-Frobenius matrix captures the asymptotic probabilities of transitioning between nucleotides in the sequence over a specified number of steps `n`. It provides insight into the long-term behavior of a Markov chain or a dynamical system associated with the sequence. # Arguments -- `sequence::LongNucOrView{4}`: A nucleotide sequence represented as a `LongNucOrView{4}` object. +- `sequence::NucleicSeqOrView{A}`: A nucleotide sequence represented as a `NucleicSeqOrView{A}` object. - `n::Int64=1`: The number of steps to consider for the transition probability matrix. Default is 1. # Returns @@ -20,7 +20,7 @@ sequence = LongSequence{DNAAlphabet{4}}("ACGTCGTCCACTACGACATCAGC") # Replace wi n = 2 pf = perronfrobenius(sequence, n) """ -function perronfrobenius(sequence::LongNucOrView{4}; n::Int64=1) +function perronfrobenius(sequence::NucleicSeqOrView{A}; n::Int64=1) where A tpm = transition_probability_matrix(sequence, n) return copy(tpm') end @@ -66,12 +66,12 @@ pf = perronfrobenius(orfdna) newdna = generatedna(pf, 100) ``` """ -function generatedna(pf::Matrix{Float64}, steps::Int64; extended_alphabet::Bool = false) - newseq = LongDNA{4}() - # pf = transpose(tpm) # The Perron-Frobenius matrix - trajectory = MarkovChainHammer.generate(pf, steps) - for i in trajectory - newseq = append!(newseq, _int_to_dna(i; extended_alphabet)) - end - return newseq -end \ No newline at end of file +# function generatedna(pf::Matrix{Float64}, steps::Int64; extended_alphabet::Bool = false) +# newseq = LongDNA{4}() +# # pf = transpose(tpm) # The Perron-Frobenius matrix +# trajectory = generate(pf, steps) +# for i in trajectory +# newseq = append!(newseq, _int_to_dna(i; extended_alphabet)) +# end +# return newseq +# end \ No newline at end of file diff --git a/src/transitions.jl b/src/transitions.jl index b438f71..fa57470 100644 --- a/src/transitions.jl +++ b/src/transitions.jl @@ -22,21 +22,18 @@ tcm = transition_count_matrix(seq) 2 0 0 0 ``` """ -function transition_count_matrix(sequence::LongNucOrView{N}) where N +function transition_count_matrix(sequence::NucleicSeqOrView{A}) where A counts = reshape(count_kmers(sequence, 2), (4,4))' return copy(counts) end -function transition_count_matrix(sequence::LongAminoAcidOrView) - matrix = [(i,j) for i in AA20, j in AA20] - trans = transitions(sequence) - return reshape([get(trans, t, 0) for t in matrix], size(matrix)) +function transition_count_matrix(sequence::LongSequence{<:AminoAcidAlphabet}) + counts = reshape(count_kmers(sequence, 2), (20,20))' + return copy(counts) end -# function transition_count_matrix(sequence::LongNucOrView{N}) where N -# alphabetsymb = eltype(sequence) == DNA ? ACGT : ACGU # could eventually use `∘(sort, unique)(sequence)` to get a very specific and sorted alphabetsymbol -# # alphabetsymb = ∘(sort, unique)(sequence) -# matrix = [(i,j) for i in alphabetsymb, j in alphabetsymb] +# function transition_count_matrix(sequence::LongAminoAcidOrView) +# matrix = [(i,j) for i in AA20, j in AA20] # trans = transitions(sequence) # return reshape([get(trans, t, 0) for t in matrix], size(matrix)) # end @@ -108,17 +105,6 @@ function transition_probability_matrix(sequence::SeqOrView{A}, n::Int64 = 1) whe return n > 1 ? freqs^n : freqs end -# function transition_probability_matrix(sequence::SeqOrView{A}, n::Int64 = 1) where A -# tcm = transition_count_matrix(sequence) -# rowsums = sum(tcm, dims = 2) -# freqs = tcm ./ rowsums - -# freqs[isinf.(freqs)] .= 0.0 -# freqs[isnan.(freqs)] .= 0.0 - -# return freqs^(n) -# end - transition_probability_matrix(bmc::BioMarkovChain) = bmc.tpm @testitem "tpm" begin @@ -134,7 +120,6 @@ transition_probability_matrix(bmc::BioMarkovChain) = bmc.tpm @test round.(tpm02, digits = 3) == [0.0 0.0 0.0 0.0; 0.0 0.5 0.2 0.3; 0.0 0.375 0.625 0.0; 0.0 0.667 0.333 0.0] end - @doc raw""" initials(sequence::SeqOrView{A}) where A @@ -172,140 +157,16 @@ end initials(bmc::BioMarkovChain) = bmc.inits -@doc raw""" - sequenceprobability(sequence::LongNucOrView{4}, model::BioMarkovChain) - -Compute the probability of a given sequence using a transition probability matrix and the initial probabilities distributions of a `BioMarkovModel`. - -```math -P(X_1 = i_1, \ldots, X_T = i_T) = \pi_{i_1}^{T-1} \prod_{t=1}^{T-1} a_{i_t, i_{t+1}} -``` - -# Arguments -- `sequence::LongNucOrView{4}`: The input sequence of nucleotides. -- `tm::BioMarkovChain` is the actual data structure composed of a `tpm::Matrix{Float64}` the transition probability matrix and `initials=Vector{Float64}` the initial state probabilities. - -# Returns -- `probability::Float64`: The probability of the input sequence. - -# Example - -``` -mainseq = LongDNA{4}("CCTCCCGGACCCTGGGCTCGGGAC") - -bmc = BioMarkovChain(mainseq) - -BioMarkovChain with DNA Alphabet: - - Transition Probability Matrix -> Matrix{Float64}(4 × 4): - 0.0 1.0 0.0 0.0 - 0.0 0.5 0.2 0.3 - 0.25 0.125 0.625 0.0 - 0.0 0.6667 0.3333 0.0 - - Initial Probabilities -> Vector{Float64}(4 × 1): - 0.087 - 0.4348 - 0.3478 - 0.1304 - - Markov Chain Order -> Int64: - 1 - -newseq = LongDNA{4}("CCTG") - - 4nt DNA Sequence: - CCTG - - -dnaseqprobability(newseq, bmc) - - 0.0217 -``` -""" -function dnaseqprobability( - sequence::LongNucOrView{4}, - model::BioMarkovChain -) - init = model.inits[NUCLEICINDEXES[sequence[1]]] - - probability = init - - for t in 1:length(sequence)-1 - i, j = DINUCLEICINDEXES[@view sequence[t:t+1]] - probability *= model.tpm[i, j] - end - return probability -end - # findfirst(i -> i == (AA_T, AA_R), aamatrix) -@doc raw""" - iscoding( - sequence::LongSequence{DNAAlphabet{4}}, - codingmodel::BioMarkovChain, - noncodingmodel::BioMarkovChain, - η::Float64 = 1e-5 - ) - -Check if a given DNA sequence is likely to be coding based on a log-odds ratio. - The log-odds ratio is a statistical measure used to assess the likelihood of a sequence being coding or non-coding. It compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model. If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. - It is formally described as a decision rule: - -```math -S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, X_T=i_T)}} \right) \begin{cases} > \eta & \Rightarrow \text{{coding}} \\ < \eta & \Rightarrow \text{{noncoding}} \end{cases} -``` - -# Arguments -- `sequence::LongSequence{DNAAlphabet{4}}`: The DNA sequence to be evaluated. -- `codingmodel::BioMarkovChain`: The transition model for coding regions. -- `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions. -- `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). - -# Returns -- `true` if the sequence is likely to be coding. -- `false` if the sequence is likely to be non-coding. - -# Raises -- `ErrorException`: if the length of the sequence is not divisible by 3. -- `ErrorException`: if the sequence contains a premature stop codon. - -# Example - -``` -sequence = LondDNA("ATGGCATCTAG") -codingmodel = BioMarkovChain() -noncodingmodel = BioMarkovChain() -iscoding(sequence, codingmodel, noncodingmodel) # Returns: true -``` """ -function iscoding( - sequence::LongNucOrView{4}, - codingmodel::BioMarkovChain, - noncodingmodel::BioMarkovChain, - η::Float64 = 1e-5 -) - pcoding = dnaseqprobability(sequence, codingmodel) - pnoncoding = dnaseqprobability(sequence, noncodingmodel) - - logodds = log(pcoding / pnoncoding) - - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - - !hasprematurestop(sequence) || error("There is a premature stop codon in the sequence") - - if logodds > η - return true - else - false - end -end - -""" - logoddsratio(sequence::LongNucOrView{4}, model::BioMarkovChain) + logoddsratio(sequence::NucleicSeqOrView{A}, model::BioMarkovChain) where A Calculates the log-odds ratio between the transition probability matrix of a given DNA sequence and a reference model. # Arguments -- `sequence::LongNucOrView{4}`: A DNA sequence or view with a length of 4 nucleotides. +- `sequence::NucleicSeqOrView{A}`: A DNA, RNA sequence or view with a length of 4 nucleotides. - `model::BioMarkovChain`: A reference BioMarkovChain model. @@ -313,16 +174,15 @@ Calculates the log-odds ratio between the transition probability matrix of a giv ```julia sequence = LongNucOrView{4}("ACGT") -model = BioMarkovChain(...) # Provide appropriate initialization for BioMarkovChain +model = BioMarkovChain(sequence) # Provide appropriate initialization for BioMarkovChain result = logoddsratio(sequence, model) ``` - """ function logoddsratio( - sequence::LongNucOrView{4}, + sequence::NucleicSeqOrView{A}, model::BioMarkovChain; b::Number = ℯ -) +) where A tpm = transition_probability_matrix(sequence) return log.(b, tpm./model.tpm) @@ -348,10 +208,10 @@ function logoddsratio( end function logoddsratioscore( - sequence::LongNucOrView{4}, + sequence::NucleicSeqOrView{A}, model::BioMarkovChain; b::Number = ℯ -) +) where A tpm = transition_probability_matrix(sequence) return sum(log.(b, tpm./model.tpm)) / length(sequence) diff --git a/src/types.jl b/src/types.jl index 67bc179..e0e0c87 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,12 +1,12 @@ -# abstract type AbstractBioMarkovChain <: AbstractDiscreteMarkovChain end abstract type AbstractBioMarkovChain end """ - struct BioMarkovChain{M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain + struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain A BioMarkovChain represents a Markov chain used in biological sequence analysis. It contains a transition probability matrix (tpm) and an initial distribution of probabilities (inits) and also the order of the Markov chain. # Fields +- `statespace::S`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s. - `tpm::M`: The transition probability matrix. - `inits::I`: The initial distribution of probabilities. - `n::N`: The order of the Markov chain. @@ -34,28 +34,29 @@ BioMarkovChain: - Markov Chain Order:2 ``` """ -struct BioMarkovChain{M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain - tpm::M # The probabilities of the TransitionProbabilityMatrix struct - inits::I # the initials distribution of probabilities - n::N # The order of the Markov chain - function BioMarkovChain(tpm::M, inits::I, n::N=1) where {M<:AbstractMatrix, I<:AbstractVector, N<:Integer} - bmc = new{M,I,N}(n > 1 ? tpm^n : tpm, inits, n) - return bmc - end +struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain + statespace::S # The sequence DataType (DNA,RNA,AminoAcid) + tpm::M # The probabilities of the TransitionProbabilityMatrix struct + inits::I # the initials distribution of probabilities + n::N # The order of the Markov chain + function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} + bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n) + return bmc + end - function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A - inits = Array{Float64, 1}(undef, 1) - tcm = transition_count_matrix(sequence) - inits = vec(sum(tcm, dims = 1) ./ sum(tcm)) + function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A + inits = Array{Float64, 1}(undef, 1) + tcm = transition_count_matrix(sequence) + inits = vec(sum(tcm, dims = 1) ./ sum(tcm)) - rowsums = sum(tcm, dims = 2) - freqs = tcm ./ rowsums + rowsums = sum(tcm, dims = 2) + freqs = tcm ./ rowsums - freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf + freqs[isnan.(freqs) .| isinf.(freqs)] .= 0.0 # Handling NaN and Inf - bmc = new{Matrix{Float64},Vector{Float64},Int64}(n > 1 ? freqs^n : freqs, inits, n) - return bmc - end + bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? freqs^n : freqs, inits, n) + return bmc + end end """ diff --git a/src/utils.jl b/src/utils.jl index f45af2f..4621000 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,62 +1,3 @@ -const LongNucOrView{N} = Union{LongSequence{<:NucleicAcidAlphabet{N}},LongSubSeq{<:NucleicAcidAlphabet{N}}} - -const LongAminoAcidOrView = Union{LongSequence{<:AminoAcidAlphabet},LongSubSeq{<:AminoAcidAlphabet}} - -""" - transitions(sequence::LongSequence) - -Compute the transition counts of each pair in a given biological sequence sequence. - -# Arguments -- `sequence::LongSequence{DNAAlphabet{4}}`: a `LongSequence{DNAAlphabet{4}}` object representing the DNA sequence. - -# Returns -A dictionary with keys being `Dict{Tuple{DNA, DNA}, Int64}` objects representing -the dinucleotides, and values being the number of occurrences of each dinucleotide -in the sequence. - -# Example -``` -seq = dna"AGCTAGCTAGCT" - -dinucleotides(seq) -Dict{Tuple{DNA, DNA}, Int64} with 4 entries: - (DNA_C, DNA_T) => 3 - (DNA_G, DNA_C) => 3 - (DNA_T, DNA_A) => 2 - (DNA_A, DNA_G) => 3 -``` -""" -function transitions(sequence::SeqOrView{A}) where A - b = @view sequence[2:end] - return countmap(zip(sequence, b)) -end - -""" - hasprematurestop(sequence::LongNucOrView{4})::Bool - -Determine whether the `sequence` of type `LongSequence{DNAAlphabet{4}}` contains a premature stop codon. - -Returns a boolean indicating whether the `sequence` has more than one stop codon. -""" -function hasprematurestop(sequence::LongNucOrView{N})::Bool where N - - stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons - - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - - occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") - - @inbounds for i in 1:3:length(sequence) - 4 - codon = sequence[i:i+2] - if codon in stopcodons - return true - end - end - - return false -end - function _int_to_dna(index::Int64; extended_alphabet::Bool = false) A = extended_alphabet ? [DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N] : [DNA_A, DNA_C, DNA_G, DNA_T] return LongSequence{DNAAlphabet{4}}([A[index]]) @@ -67,15 +8,15 @@ function _dna_to_int(nucleotide::DNA; extended_alphabet::Bool = false) return findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A)) end -function randbmc(A::DataType, n::Int64=1) - if A == DNA || A == RNA +function randbmc(statespace::DataType, n::Int64=1) + if statespace == DNA || statespace == RNA tpm = rand(4, 4) row_sums = sum(tpm, dims=2) inits = rand(4) init_sum = sum(inits) @views tpm ./= row_sums @views inits ./= init_sum - elseif A == AminoAcid + elseif statespace == AminoAcid tpm = rand(20, 20) row_sums = sum(tpm, dims=2) inits = rand(20) @@ -86,5 +27,5 @@ function randbmc(A::DataType, n::Int64=1) error("Alphabet must be of the DNA, RNA or AminoAcid DataType") end - return BMC(tpm, inits, n) + return BMC(statespace, tpm, inits, n) end \ No newline at end of file