From efd1d8b73bcbef804800adb70910e2bbae965809 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 23 Jun 2023 20:55:53 +0200 Subject: [PATCH 01/33] Use JuliaFormatter --- .JuliaFormatter.toml | 8 + docs/make.jl | 36 +- src/Kmers.jl | 12 +- src/counting.jl | 48 ++- src/indexing.jl | 11 +- src/kmer.jl | 210 +++++----- src/kmer_iteration/AbstractKmerIterator.jl | 39 +- src/kmer_iteration/EveryCanonicalKmer.jl | 82 ++-- src/kmer_iteration/EveryKmer.jl | 96 +++-- src/kmer_iteration/SpacedCanonicalKmers.jl | 79 ++-- src/kmer_iteration/SpacedKmers.jl | 61 ++- src/predicates.jl | 8 +- src/revtrans.jl | 24 +- src/transformations.jl | 94 ++--- src/tuple_bitflipping.jl | 36 +- test/access.jl | 24 +- test/biosequences_interface.jl | 61 ++- test/comparisons.jl | 34 +- test/construction_and_conversion.jl | 424 ++++++++++++++++++--- test/debruijn_neighbors.jl | 12 +- test/find.jl | 8 +- test/iteration.jl | 252 ++++++------ test/mismatches.jl | 10 +- test/order.jl | 14 +- test/print.jl | 12 +- test/runtests.jl | 2 +- test/transformations.jl | 20 +- test/translation.jl | 272 +++++++------ test/utils.jl | 67 +++- 29 files changed, 1350 insertions(+), 706 deletions(-) create mode 100644 .JuliaFormatter.toml diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..903da37 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,8 @@ +always_for_in = true +whitespace_typedefs = true +whitespace_ops_in_indices = true +remove_extra_newlines = true +import_to_using = true +normalize_line_endings = "unix" +separate_kwargs_with_semicolon = true +whitespace_in_kwargs = false diff --git a/docs/make.jl b/docs/make.jl index 61b4844..200452b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,29 +1,29 @@ using Documenter, Kmers -makedocs( - format = Documenter.HTML(), - sitename = "Kmers.jl", - pages = [ - "Home" => "index.md", - "Kmer types" => "kmer_types.md", - "Constructing kmers" => "construction.md", - "Indexing & modifying kmers" => "transforms.md", - 
"Predicates" => "predicates.md", - "Random kmers" => "random.md", - "Iterating over Kmers" => "iteration.md", - "Translation" => "translate.md", +makedocs(; + format=Documenter.HTML(), + sitename="Kmers.jl", + pages=[ + "Home" => "index.md", + "Kmer types" => "kmer_types.md", + "Constructing kmers" => "construction.md", + "Indexing & modifying kmers" => "transforms.md", + "Predicates" => "predicates.md", + "Random kmers" => "random.md", + "Iterating over Kmers" => "iteration.md", + "Translation" => "translate.md", #"Pattern matching and searching" => "sequence_search.md", #"Iteration" => "iteration.md", #"Counting" => "counting.md", #"I/O" => "io.md", #"Interfaces" => "interfaces.md" ], - authors = "Ben J. Ward, The BioJulia Organisation and other contributors." + authors="Ben J. Ward, The BioJulia Organisation and other contributors.", ) -deploydocs( - repo = "github.com/BioJulia/Kmers.jl.git", - push_preview = true, - deps = nothing, - make = nothing +deploydocs(; + repo="github.com/BioJulia/Kmers.jl.git", + push_preview=true, + deps=nothing, + make=nothing, ) diff --git a/src/Kmers.jl b/src/Kmers.jl index be1a2c5..f08aa19 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -80,7 +80,7 @@ export AA_X, AA_Term, AA_Gap, - + # BioSequences re-exports Alphabet, BioSequence, @@ -89,8 +89,7 @@ export DNAAlphabet, RNAAlphabet, translate, - - + ### ### Mers ### @@ -125,11 +124,11 @@ export reverse_translate, reverse_translate!, ReverseGeneticCode, - + ### ### Sequence literals ### - + @mer_str, @bigmer_str @@ -139,7 +138,8 @@ ispermitted(::DNAAlphabet{2}, nt::DNA) = count_ones(nt) == 1 && isvalid(nt) ispermitted(::DNAAlphabet{2}, data::UInt) = data < UInt(4) ispermitted(::DNAAlphabet{4}, nt::DNA) = isvalid(nt) ispermitted(::DNAAlphabet{4}, data::UInt) = isvalid(DNA, data) -ispermitted(::AminoAcidAlphabet, aa::AminoAcid) = reinterpret(UInt8, aa) <= reinterpret(UInt8, AA_Gap) +ispermitted(::AminoAcidAlphabet, aa::AminoAcid) = + reinterpret(UInt8, aa) <= reinterpret(UInt8, 
AA_Gap) ispermitted(::AminoAcidAlphabet, data::UInt) = data <= 0x1b include("kmer.jl") diff --git a/src/counting.jl b/src/counting.jl index cbd9340..edefc1e 100644 --- a/src/counting.jl +++ b/src/counting.jl @@ -2,30 +2,46 @@ ### Mer specific specializations of src/biosequence/counting.jl ### -for i in [(:_count_a, :a_bitcount), (:_count_c, :c_bitcount), (:_count_g, :g_bitcount), (:_count_t, :t_bitcount)] +for i in [ + (:_count_a, :a_bitcount), + (:_count_c, :c_bitcount), + (:_count_g, :g_bitcount), + (:_count_t, :t_bitcount), +] @eval begin - @inline function $(i[1])(alph::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} + @inline function $(i[1])( + alph::A, + head::UInt64, + tail..., + ) where {A <: NucleicAcidAlphabet} return $BioSequences.$(i[2])(head, alph) + $(i[1])(alph, tail...) end - @inline $(i[1])(alph::A) where {A<:NucleicAcidAlphabet} = 0 + @inline $(i[1])(alph::A) where {A <: NucleicAcidAlphabet} = 0 end end -@inline function _count_gc(alph::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} +@inline function _count_gc(alph::A, head::UInt64, tail...) where {A <: NucleicAcidAlphabet} return BioSequences.gc_bitcount(head, alph) + _count_gc(alph, tail...) end -@inline _count_gc(::A) where {A<:NucleicAcidAlphabet} = 0 +@inline _count_gc(::A) where {A <: NucleicAcidAlphabet} = 0 -count_a(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_a(A(), x.data...) - n_unused(x) -count_c(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_c(A(), x.data...) -count_g(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_g(A(), x.data...) -count_t(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_t(A(), x.data...) +count_a(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = + _count_a(A(), x.data...) - n_unused(x) +count_c(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_c(A(), x.data...) +count_g(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_g(A(), x.data...) 
+count_t(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_t(A(), x.data...) -count_gc(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_gc(A(), x.data...) -Base.count(::typeof(isGC), x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = count_gc(x) +count_gc(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = + _count_gc(A(), x.data...) +Base.count(::typeof(isGC), x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = + count_gc(x) # TODO: Expand to Amino Acid Kmers as well... -@inline function Base.count(::typeof(!=), a::Kmer{A,K,N}, b::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} +@inline function Base.count( + ::typeof(!=), + a::Kmer{A, K, N}, + b::Kmer{A, K, N}, +) where {A <: NucleicAcidAlphabet, K, N} ad = a.data bd = b.data sum = 0 @@ -36,7 +52,11 @@ Base.count(::typeof(isGC), x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = end # TODO: Expand to Amino Acid Kmers as well... -@inline function Base.count(::typeof(==), a::Kmer{A,K,N}, b::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} +@inline function Base.count( + ::typeof(==), + a::Kmer{A, K, N}, + b::Kmer{A, K, N}, +) where {A <: NucleicAcidAlphabet, K, N} ad = a.data bd = b.data sum = 0 @@ -44,4 +64,4 @@ end sum += BioSequences.match_bitcount(ad[i], bd[i], A()) end return sum - n_unused(a) -end \ No newline at end of file +end diff --git a/src/indexing.jl b/src/indexing.jl index 021926a..1221466 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -9,8 +9,11 @@ end @inline encoded_data(x::Kmer) = x.data -@inline BioSequences.bitindex(seq::Kmer, i::Integer) = BioSequences.bitindex(BioSequences.BitsPerSymbol(seq), BioSequences.encoded_data_eltype(typeof(seq)), i + n_unused(seq)) - +@inline BioSequences.bitindex(seq::Kmer, i::Integer) = BioSequences.bitindex( + BioSequences.BitsPerSymbol(seq), + BioSequences.encoded_data_eltype(typeof(seq)), + i + n_unused(seq), +) """ Base.getindex(seq::Kmer, i::UnitRange) @@ -21,7 +24,7 @@ Slice a Kmer by a UnitRange. 
Using this function will introduce performance penalties in your code if you pass values of `i` that are not constants that can be propagated. """ -@inline function Base.getindex(seq::Kmer{A}, i::UnitRange) where A +@inline function Base.getindex(seq::Kmer{A}, i::UnitRange) where {A} @boundscheck Base.checkbounds(seq, i) ind(s, i) = BioSequences.index(BioSequences.bitindex(s, i)) off(s, i) = BioSequences.offset(BioSequences.bitindex(s, i)) @@ -32,4 +35,4 @@ Slice a Kmer by a UnitRange. data = Kmers.rightshift_carry(seq.data, rshift) T = Kmers.kmertype(Kmer{A, length(i)}) return T(data[start:stop]) -end \ No newline at end of file +end diff --git a/src/kmer.jl b/src/kmer.jl index b71a57a..b956783 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -39,23 +39,25 @@ of the type are still defined. available, since they can return a new kmer value as a result e.g. `reverse_complement`. """ -struct Kmer{A<:Alphabet,K,N} <: BioSequence{A} - data::NTuple{N,UInt64} - +struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} + data::NTuple{N, UInt64} + # This unsafe method do not clip the head - Kmer{A,K,N}(::Unsafe, data::NTuple{N,UInt64}) where {A<:Alphabet,K,N} = new{A,K,N}(data) + Kmer{A, K, N}(::Unsafe, data::NTuple{N, UInt64}) where {A <: Alphabet, K, N} = + new{A, K, N}(data) - function Kmer{A,K,N}(data::NTuple{N,UInt64}) where {A<:Alphabet,K,N} - checkmer(Kmer{A,K,N}) - x = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) + function Kmer{A, K, N}(data::NTuple{N, UInt64}) where {A <: Alphabet, K, N} + checkmer(Kmer{A, K, N}) + x = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) return new(_cliphead(x, data...)) end end -BioSequences.encoded_data(seq::Kmer{A,K,N}) where {A,K,N} = seq.data +BioSequences.encoded_data(seq::Kmer{A, K, N}) where {A, K, N} = seq.data # Create a blank ntuple of appropriate length for a given Kmer with N. 
-@inline blank_ntuple(::Type{Kmer{A,K,N}}) where {A,K,N} = ntuple(x -> zero(UInt64), Val{N}()) +@inline blank_ntuple(::Type{Kmer{A, K, N}}) where {A, K, N} = + ntuple(x -> zero(UInt64), Val{N}()) ### ### _build_kmer_data @@ -78,24 +80,28 @@ This particular method is specialised for LongSequences, and for when the Kmer and LongSequence types used, share the same alphabet, since a lot of encoding / decoding can be skipped, and the problem is mostly one of shunting bits around. """ -@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - checkmer(Kmer{A,K,N}) - +@inline function _build_kmer_data( + ::Type{Kmer{A, K, N}}, + seq::LongSequence{A}, + from::Int=1, +) where {A, K, N} + checkmer(Kmer{A, K, N}) + bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold. - n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold. - + n_head = elements_in_head(Kmer{A, K, N}) # Based on kmer type, should constant fold. + n_per_chunk = per_word_capacity(Kmer{A, K, N}) # Based on kmer type, should constant fold. + if from + K - 1 > length(seq) return nothing end - + # Construct the head. head = zero(UInt64) @inbounds for i in from:(from + n_head - 1) bits = UInt64(BioSequences.extract_encoded_element(seq, i)) head = (head << bits_per_sym) | bits end - + # And the rest of the sequence idx = Ref(from + n_head) tail = ntuple(Val{N - 1}()) do i @@ -108,13 +114,11 @@ decoding can be skipped, and the problem is mostly one of shunting bits around. end return body end - + # Put head and tail together return (head, tail...) 
end - - ### ### Constructors ### @@ -140,28 +144,28 @@ DNA 5-mer: TTAGC ``` """ -function Kmer{A,K,N}(itr) where {A,K,N} - checkmer(Kmer{A,K,N}) - +function Kmer{A, K, N}(itr) where {A, K, N} + checkmer(Kmer{A, K, N}) + seqlen = length(itr) if seqlen != K throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) end - + ## All based on alphabet type of Kmer, so should constant fold. bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - + n_head = elements_in_head(Kmer{A, K, N}) + n_per_chunk = per_word_capacity(Kmer{A, K, N}) + # Construct the head. head = zero(UInt64) @inbounds for i in 1:n_head (x, next_i) = iterate(itr, i) - sym = convert(eltype(Kmer{A,K,N}), x) + sym = convert(eltype(Kmer{A, K, N}), x) # Encode will throw if it cant encode an element. head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) end - + # And the rest of the sequence idx = Ref(n_head + 1) tail = ntuple(Val{N - 1}()) do i @@ -169,17 +173,17 @@ function Kmer{A,K,N}(itr) where {A,K,N} body = zero(UInt64) @inbounds for i in 1:n_per_chunk (x, next_idx) = iterate(itr, idx[]) - sym = convert(eltype(Kmer{A,K,N}), x) + sym = convert(eltype(Kmer{A, K, N}), x) # Encode will throw if it cant encode an element. body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) idx[] += 1 end return body end - + data = (head, tail...) - - return Kmer{A,K,N}(data) + + return Kmer{A, K, N}(data) end """ @@ -205,26 +209,26 @@ DNA 5-mer: TTAGC ``` """ -@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N} - checkmer(Kmer{A,K,N}) - +@inline function Kmer{A, K, N}(seq::BioSequence{A}) where {A, K, N} + checkmer(Kmer{A, K, N}) + seqlen = length(seq) if seqlen != K throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)")) end - + ## All based on alphabet type of Kmer, so should constant fold. 
bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - + n_head = elements_in_head(Kmer{A, K, N}) + n_per_chunk = per_word_capacity(Kmer{A, K, N}) + # Construct the head. head = zero(UInt64) @inbounds for i in 1:n_head bits = UInt64(BioSequences.extract_encoded_element(seq, i)) head = (head << bits_per_sym) | bits end - + # And the rest of the sequence idx = Ref(n_head + 1) tail = ntuple(Val{N - 1}()) do i @@ -237,12 +241,11 @@ TTAGC end return body end - + data = (head, tail...) - - return Kmer{A,K,N}(data) -end + return Kmer{A, K, N}(data) +end # Convenience version of function above so you don't have to work out correct N. """ @@ -253,8 +256,8 @@ Construct a `Kmer{A,K,N}` from an iterable. This is a convenience method which will work out the correct `N` parameter, for your given choice of `A` & `K`. """ -@inline function Kmer{A,K}(itr) where {A,K} - T = kmertype(Kmer{A,K}) +@inline function Kmer{A, K}(itr) where {A, K} + T = kmertype(Kmer{A, K}) return T(itr) end @@ -269,14 +272,18 @@ the correct `N` parameter, for your given choice of `A` & `K`. !!! warning Since this gets K from runtime values, this is gonna be slow! 
""" -@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr) -@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq) +@inline Kmer{A}(itr) where {A} = Kmer{A, length(itr)}(itr) +@inline Kmer(seq::BioSequence{A}) where {A} = Kmer{A}(seq) -function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} +function Kmer{A1}( + seq::BioSequence{A2}, +) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} kmertype(Kmer{A1, length(seq)})(seq) end -@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K} +@inline function Kmer{A}( + nts::Vararg{Union{DNA, RNA}, K}, +) where {A <: NucleicAcidAlphabet, K} return kmertype(Kmer{A, K})(nts) end @@ -310,7 +317,6 @@ UUAGC """ @inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...)) - """ Kmer(seq::String) @@ -340,10 +346,9 @@ TTAGC throw(ArgumentError("Can't detect alphabet type from string")) end A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2}) - return Kmer{A,length(seq′)}(seq′) + return Kmer{A, length(seq′)}(seq′) end - """ kmertype(::Type{Kmer{A,K}}) where {A,K} Resolve and incomplete kmer typing, computing the N parameter of @@ -356,59 +361,61 @@ julia> kmertype(DNAKmer{63}) Kmer{DNAAlphabet{2},63,2} ``` """ -@inline function kmertype(::Type{Kmer{A,K}}) where {A,K} - return Kmer{A,K,BioSequences.seq_data_len(A, K)} +@inline function kmertype(::Type{Kmer{A, K}}) where {A, K} + return Kmer{A, K, BioSequences.seq_data_len(A, K)} end -@inline kmertype(::Type{Kmer{A,K,N}}) where {A,K,N} = Kmer{A,K,N} +@inline kmertype(::Type{Kmer{A, K, N}}) where {A, K, N} = Kmer{A, K, N} # Aliases "Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" -const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N} +const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} "Shortcut for the type `DNAKmer{27,1}`" -const DNA27mer = DNAKmer{27,1} +const DNA27mer = DNAKmer{27, 1} "Shortcut for the type `DNAKmer{31,1}`" -const DNA31mer = DNAKmer{31,1} +const DNA31mer = DNAKmer{31, 
1} "Shortcut for the type `DNAKmer{63,2}`" -const DNA63mer = DNAKmer{63,2} +const DNA63mer = DNAKmer{63, 2} "Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" -const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N} +const RNAKmer{K, N} = Kmer{RNAAlphabet{2}, K, N} "Shortcut for the type `RNAKmer{27,1}`" -const RNA27mer = RNAKmer{27,1} +const RNA27mer = RNAKmer{27, 1} "Shortcut for the type `RNAKmer{31,1}`" -const RNA31mer = RNAKmer{31,1} +const RNA31mer = RNAKmer{31, 1} "Shortcut for the type `RNAKmer{63,2}`" -const RNA63mer = RNAKmer{63,2} +const RNA63mer = RNAKmer{63, 2} "Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" -const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N} +const AAKmer{K, N} = Kmer{AminoAcidAlphabet, K, N} "Shorthand for `DNAKmer{3,1}`" -const DNACodon = DNAKmer{3,1} +const DNACodon = DNAKmer{3, 1} "Shorthand for `RNAKmer{3,1}`" -const RNACodon = RNAKmer{3,1} - +const RNACodon = RNAKmer{3, 1} ### ### Base Functions ### -@inline ksize(::Type{Kmer{A,K,N}}) where {A,K,N} = K -@inline nsize(::Type{Kmer{A,K,N}}) where {A,K,N} = N -@inline per_word_capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64, BioSequences.bits_per_symbol(A())) +@inline ksize(::Type{Kmer{A, K, N}}) where {A, K, N} = K +@inline nsize(::Type{Kmer{A, K, N}}) where {A, K, N} = N +@inline per_word_capacity(::Type{Kmer{A, K, N}}) where {A, K, N} = + div(64, BioSequences.bits_per_symbol(A())) @inline per_word_capacity(seq::Kmer) = per_word_capacity(typeof(seq)) -@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = per_word_capacity(Kmer{A,K,N}) * N +@inline capacity(::Type{Kmer{A, K, N}}) where {A, K, N} = + per_word_capacity(Kmer{A, K, N}) * N @inline capacity(seq::Kmer) = capacity(typeof(seq)) -@inline n_unused(::Type{Kmer{A,K,N}}) where {A,K,N} = capacity(Kmer{A,K,N}) - K +@inline n_unused(::Type{Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K @inline n_unused(seq::Kmer) = n_unused(typeof(seq)) -@inline elements_in_head(::Type{Kmer{A,K,N}}) where {A,K,N} = 
per_word_capacity(Kmer{A,K,N}) - n_unused(Kmer{A,K,N}) +@inline elements_in_head(::Type{Kmer{A, K, N}}) where {A, K, N} = + per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) @inline elements_in_head(seq::Kmer) = elements_in_head(typeof(seq)) """ @@ -424,7 +431,7 @@ Because it is used on type parameters / variables, these conditions should be checked at compile time, and the branches / error throws eliminated when the parameterisation of the Kmer type is good. """ -@inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} +@inline function checkmer(::Type{Kmer{A, K, N}}) where {A, K, N} if K < 1 throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) end @@ -436,26 +443,26 @@ parameterisation of the Kmer type is good. end end -@inline Base.length(x::Kmer{A,K,N}) where {A,K,N} = K -@inline Base.summary(x::Kmer{A,K,N}) where {A,K,N} = string(eltype(x), ' ', K, "-mer") +@inline Base.length(x::Kmer{A, K, N}) where {A, K, N} = K +@inline Base.summary(x::Kmer{A, K, N}) where {A, K, N} = string(eltype(x), ' ', K, "-mer") -function Base.typemin(::Type{Kmer{A,K,N}}) where {A,K,N} - return Kmer{A,K,N}(unsafe, ntuple(i -> zero(UInt64), N)) +function Base.typemin(::Type{Kmer{A, K, N}}) where {A, K, N} + return Kmer{A, K, N}(unsafe, ntuple(i -> zero(UInt64), N)) end -function Base.typemax(::Type{Kmer{A,K,N}}) where {A,K,N} - return Kmer{A,K,N}((typemax(UInt64), ntuple(i -> typemax(UInt64), N - 1)...)) +function Base.typemax(::Type{Kmer{A, K, N}}) where {A, K, N} + return Kmer{A, K, N}((typemax(UInt64), ntuple(i -> typemax(UInt64), N - 1)...)) end -@inline function rand_kmer_data(::Type{Kmer{A,K,N}}, ::Val{true}) where {A,K,N} - return Kmer{A,K,N}(ntuple(i -> rand(UInt64), Val{N}())) +@inline function rand_kmer_data(::Type{Kmer{A, K, N}}, ::Val{true}) where {A, K, N} + return Kmer{A, K, N}(ntuple(i -> rand(UInt64), Val{N}())) end -@inline function rand_kmer_data(::Type{Kmer{A,K,N}}, ::Val{false}) where {A,K,N} +@inline function 
rand_kmer_data(::Type{Kmer{A, K, N}}, ::Val{false}) where {A, K, N} ## All based on alphabet type of Kmer, so should constant fold. bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) + n_head = elements_in_head(Kmer{A, K, N}) + n_per_chunk = per_word_capacity(Kmer{A, K, N}) # Construct the head. head = zero(UInt64) @inbounds for i in 1:n_head @@ -475,7 +482,6 @@ end return (head, tail...) end - """ Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} Base.rand(::Type{Kmer{A,K}}) where {A,K} @@ -490,14 +496,14 @@ ACT ``` """ -@inline function Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} - checkmer(Kmer{A,K,N}) - return Kmer{A,K,N}(rand_kmer_data(Kmer{A,K,N}, BioSequences.iscomplete(A()))) +@inline function Base.rand(::Type{Kmer{A, K, N}}) where {A, K, N} + checkmer(Kmer{A, K, N}) + return Kmer{A, K, N}(rand_kmer_data(Kmer{A, K, N}, BioSequences.iscomplete(A()))) end -Base.rand(::Type{Kmer{A,K}}) where {A,K} = rand(kmertype(Kmer{A,K})) +Base.rand(::Type{Kmer{A, K}}) where {A, K} = rand(kmertype(Kmer{A, K})) -function Base.rand(::Type{T}, size::Integer) where {T<:Kmer} +function Base.rand(::Type{T}, size::Integer) where {T <: Kmer} return [rand(T) for _ in 1:size] end @@ -519,7 +525,7 @@ include("indexing.jl") #LongSequence{A}(x::Kmer{A,K,N}) where {A,K,N} = LongSequence{A}([nt for nt in x]) # Convenience method so as don't need to specify A in LongSequence{A}. -BioSequences.LongSequence(x::Kmer{A,K,N}) where {A,K,N} = LongSequence{A}(x) +BioSequences.LongSequence(x::Kmer{A, K, N}) where {A, K, N} = LongSequence{A}(x) include("predicates.jl") include("counting.jl") @@ -530,10 +536,14 @@ include("transformations.jl") ### # TODO: Decide on this vs. old iterator pattern. I like the terseness of the code vs defining an iterator. Neither should allocate. 
-fw_neighbors(kmer::Kmer{A,K,N}) where {A<:DNAAlphabet,K,N} = ntuple(i -> pushlast(kmer, ACGT[i]), Val{4}()) -fw_neighbors(kmer::Kmer{A,K,N}) where {A<:RNAAlphabet,K,N} = ntuple(i -> pushlast(kmer, ACGU[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A,K,N}) where {A<:DNAAlphabet,K,N} = ntuple(i -> pushfirst(kmer, ACGT[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A,K,N}) where {A<:RNAAlphabet,K,N} = ntuple(i -> pushfirst(kmer, ACGU[i]), Val{4}()) +fw_neighbors(kmer::Kmer{A, K, N}) where {A <: DNAAlphabet, K, N} = + ntuple(i -> pushlast(kmer, ACGT[i]), Val{4}()) +fw_neighbors(kmer::Kmer{A, K, N}) where {A <: RNAAlphabet, K, N} = + ntuple(i -> pushlast(kmer, ACGU[i]), Val{4}()) +bw_neighbors(kmer::Kmer{A, K, N}) where {A <: DNAAlphabet, K, N} = + ntuple(i -> pushfirst(kmer, ACGT[i]), Val{4}()) +bw_neighbors(kmer::Kmer{A, K, N}) where {A <: RNAAlphabet, K, N} = + ntuple(i -> pushfirst(kmer, ACGU[i]), Val{4}()) #= # Neighbors on a de Bruijn graph @@ -573,7 +583,7 @@ macro mer_str(seq, flag) elseif flag == "rna" || flag == "r" T = kmertype(RNAKmer{length(seq′)}) return T(seq′) - elseif flag == "aa" || flag == "a" || flag == "prot" || flag == "p" + elseif flag == "aa" || flag == "a" || flag == "prot" || flag == "p" T = kmertype(AAKmer{length(seq′)}) return T(seq′) else @@ -587,4 +597,4 @@ macro mer_str(seq) return T(seq′) end -include("revtrans.jl") \ No newline at end of file +include("revtrans.jl") diff --git a/src/kmer_iteration/AbstractKmerIterator.jl b/src/kmer_iteration/AbstractKmerIterator.jl index f320476..4e20420 100644 --- a/src/kmer_iteration/AbstractKmerIterator.jl +++ b/src/kmer_iteration/AbstractKmerIterator.jl @@ -8,29 +8,44 @@ ### Type for storing the result of Kmer iteration. 
-abstract type AbstractKmerIterator{T<:Kmer,S<:BioSequence} end +abstract type AbstractKmerIterator{T <: Kmer, S <: BioSequence} end -@inline Base.eltype(::Type{<:AbstractKmerIterator{T,S}}) where {T,S} = Tuple{UInt64,T} +@inline Base.eltype(::Type{<:AbstractKmerIterator{T, S}}) where {T, S} = Tuple{UInt64, T} -@inline Base.IteratorSize(::Type{<:AbstractKmerIterator{Kmer{A,K,N},S}}) where {A,S<:BioSequence{A},K,N} = Base.HasLength() -@inline Base.IteratorSize(::Type{<:AbstractKmerIterator{Kmer{A,K,N},S}}) where {A,B,S<:BioSequence{B},K,N} = Base.SizeUnknown() +@inline Base.IteratorSize( + ::Type{<:AbstractKmerIterator{Kmer{A, K, N}, S}}, +) where {A, S <: BioSequence{A}, K, N} = Base.HasLength() +@inline Base.IteratorSize( + ::Type{<:AbstractKmerIterator{Kmer{A, K, N}, S}}, +) where {A, B, S <: BioSequence{B}, K, N} = Base.SizeUnknown() -@inline function Base.length(it::AbstractKmerIterator{Kmer{A,K,N},S}) where {A,K,N,S<:BioSequence{A}} +@inline function Base.length( + it::AbstractKmerIterator{Kmer{A, K, N}, S}, +) where {A, K, N, S <: BioSequence{A}} return max(0, fld(it.stop - it.start + 1 - K, step(it)) + 1) end # Iteration where the Kmer and Seq alphabets match: ## Initial iteration without state. -@inline function Base.iterate(it::AbstractKmerIterator{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) +@inline function Base.iterate( + it::AbstractKmerIterator{Kmer{A, K, N}, LongSequence{A}}, +) where {A, K, N} + fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) if isnothing(fwkmer) return nothing else # Get the reverse. - alph = Alphabet(Kmer{A,K,N}) - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(alph) # Based on alphabet type, should constant fold. 
- rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(alph), _complement_bitpar(alph, fwkmer...)...), rshift) - return KmerAt{Kmer{A,K,N}}(1, Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer)), (K, fwkmer, rvkmer) + alph = Alphabet(Kmer{A, K, N}) + rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(alph) # Based on alphabet type, should constant fold. + rvkmer = rightshift_carry( + _reverse( + BioSequences.BitsPerSymbol(alph), + _complement_bitpar(alph, fwkmer...)..., + ), + rshift, + ) + return KmerAt{Kmer{A, K, N}}(1, Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer)), + (K, fwkmer, rvkmer) end -end \ No newline at end of file +end diff --git a/src/kmer_iteration/EveryCanonicalKmer.jl b/src/kmer_iteration/EveryCanonicalKmer.jl index 21e883b..321437e 100644 --- a/src/kmer_iteration/EveryCanonicalKmer.jl +++ b/src/kmer_iteration/EveryCanonicalKmer.jl @@ -20,15 +20,20 @@ An iterator over every canonical valid overlapping `T<:Kmer` in a given longer As a result, the overlap between successive kmers may not reliably be K - 1, and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. """ -struct EveryCanonicalKmer{T<:Kmer,S<:BioSequence{<:NucleicAcidAlphabet}} <: AbstractKmerIterator{T,S} +struct EveryCanonicalKmer{T <: Kmer, S <: BioSequence{<:NucleicAcidAlphabet}} <: + AbstractKmerIterator{T, S} seq::S start::Int stop::Int - - function EveryCanonicalKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} + + function EveryCanonicalKmer{T, S}( + seq::S, + start::Int=firstindex(seq), + stop::Int=lastindex(seq), + ) where {T <: Kmer, S <: BioSequence} T′ = kmertype(T) checkmer(T′) # Should inline and constant fold. - return new{T′,S}(seq, start, stop) + return new{T′, S}(seq, start, stop) end end @@ -39,8 +44,12 @@ Convenience outer constructor so you don't have to specify `S` along with `T`. E.g. 
Instead of `EveryCanonicalKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryCanonicalKmer{DNACodon}(s)` """ -function EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return EveryCanonicalKmer{T,S}(seq, start, stop) +function EveryCanonicalKmer{T}( + seq::S, + start=firstindex(seq), + stop=lastindex(seq), +) where {T <: Kmer, S <: BioSequence} + return EveryCanonicalKmer{T, S}(seq, start, stop) end """ @@ -54,56 +63,74 @@ taken from `::Val{K}`, and `N` is deduced using `A` and `K`. E.g. Instead of `EveryCanonicalKmer{DNAKmer{3,1}}(s)`, or `EveryCanonicalKmer{DNACodon}(s)`, you can use `EveryCanonicalKmer(s, Val(3))` """ -function EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return EveryCanonicalKmer{Kmer{A,K}}(seq, start, stop) +function EveryCanonicalKmer( + seq::BioSequence{A}, + ::Val{K}, + start=firstindex(seq), + stop=lastindex(seq), +) where {A, K} + return EveryCanonicalKmer{Kmer{A, K}}(seq, start, stop) end Base.step(x::EveryCanonicalKmer) = 1 - - ## Initial iteration without state. -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, it.start) +@inline function Base.iterate( + it::EveryCanonicalKmer{Kmer{A, K, N}, LongSequence{A}}, +) where {A, K, N} + fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, it.start) if isnothing(fwkmer) return nothing else - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(A()), _complement_bitpar(A(), fwkmer...)...), rshift) - return (it.start, Kmer{A,K,N}(min(fwkmer, rvkmer))), (it.start + K - 1, fwkmer, rvkmer) + rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. 
+ rvkmer = rightshift_carry( + _reverse( + BioSequences.BitsPerSymbol(A()), + _complement_bitpar(A(), fwkmer...)..., + ), + rshift, + ) + return (it.start, Kmer{A, K, N}(min(fwkmer, rvkmer))), + (it.start + K - 1, fwkmer, rvkmer) end end -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} +@inline function Base.iterate( + it::EveryCanonicalKmer{Kmer{A, K, N}, LongSequence{A}}, + state, +) where {A, K, N} i, fwkmer, rvkmer = state i += 1 if i > it.stop return nothing else bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * bps) # Based on type info, should constant fold. + rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * bps) # Based on type info, should constant fold. mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. - + fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift fwkmer = leftshift_carry(fwkmer, bps, fbits) rvkmer = rightshift_carry(rvkmer, bps, rbits) pos = i - K + 1 - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, fwkmer, rvkmer) + return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (i, fwkmer, rvkmer) end end -@inline Base.IteratorSize(::Type{<:EveryCanonicalKmer{Kmer{A,N,K},LongSequence{B}}}) where {A<:NucleicAcidAlphabet{2},N,K,B<:NucleicAcidAlphabet{4}} = Base.SizeUnknown() +@inline Base.IteratorSize( + ::Type{<:EveryCanonicalKmer{Kmer{A, N, K}, LongSequence{B}}}, +) where {A <: NucleicAcidAlphabet{2}, N, K, B <: NucleicAcidAlphabet{4}} = + Base.SizeUnknown() -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{B}}, - state = (it.start - 1, 1, blank_ntuple(Kmer{A,K,N}), blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} - +@inline function Base.iterate( + it::EveryCanonicalKmer{Kmer{A, K, N}, 
LongSequence{B}}, + state=(it.start - 1, 1, blank_ntuple(Kmer{A, K, N}), blank_ntuple(Kmer{A, K, N})), +) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} i, filled, fwkmer, rvkmer = state i += 1 filled -= 1 - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * 2) # Based on type info, should constant fold. + rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * 2) # Based on type info, should constant fold. mask = (one(UInt64) << 2) - one(UInt64) # Based on type info, should constant fold. while i ≤ it.stop @@ -114,9 +141,10 @@ end rvkmer = rightshift_carry(rvkmer, 2, rbits) filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) if filled == K - return (i - K + 1, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, filled, fwkmer, rvkmer) + return (i - K + 1, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), + (i, filled, fwkmer, rvkmer) end i += 1 end return nothing -end \ No newline at end of file +end diff --git a/src/kmer_iteration/EveryKmer.jl b/src/kmer_iteration/EveryKmer.jl index 7885afb..115b41d 100644 --- a/src/kmer_iteration/EveryKmer.jl +++ b/src/kmer_iteration/EveryKmer.jl @@ -28,15 +28,19 @@ An iterator over every valid overlapping `T<:Kmer` in a given longer As a result, the overlap between successive kmers may not reliably be K - 1, and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. """ -struct EveryKmer{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} +struct EveryKmer{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} seq::S start::Int stop::Int - - function EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} + + function EveryKmer{T, S}( + seq::S, + start::Int=firstindex(seq), + stop::Int=lastindex(seq), + ) where {T <: Kmer, S <: BioSequence} T′ = kmertype(T) checkmer(T′) # Should inline and constant fold. 
- return new{T′,S}(seq, start, stop) + return new{T′, S}(seq, start, stop) end end @@ -47,8 +51,12 @@ Convenience outer constructor so you don't have to specify `S` along with `T`. E.g. Instead of `EveryKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryKmer{DNACodon}(s)` """ -function EveryKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return EveryKmer{T,S}(seq, start, stop) +function EveryKmer{T}( + seq::S, + start=firstindex(seq), + stop=lastindex(seq), +) where {T <: Kmer, S <: BioSequence} + return EveryKmer{T, S}(seq, start, stop) end """ @@ -62,23 +70,31 @@ taken from `::Val{K}`, and `N` is deduced using `A` and `K`. E.g. Instead of `EveryKmer{DNAKmer{3,1}}(s)`, or `EveryKmer{DNACodon}(s)`, you can use `EveryKmer(s, Val(3))` """ -function EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return EveryKmer{Kmer{A,K}}(seq, start, stop) +function EveryKmer( + seq::BioSequence{A}, + ::Val{K}, + start=firstindex(seq), + stop=lastindex(seq), +) where {A, K} + return EveryKmer{Kmer{A, K}}(seq, start, stop) end Base.step(x::EveryKmer) = 1 ## Initial iteration without state. 
-@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - kmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) +@inline function Base.iterate(it::EveryKmer{Kmer{A, K, N}, LongSequence{A}}) where {A, K, N} + kmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) if isnothing(kmer) return nothing else - return (1, Kmer{A,K,N}(kmer)), (K, kmer) + return (1, Kmer{A, K, N}(kmer)), (K, kmer) end end -@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} +@inline function Base.iterate( + it::EveryKmer{Kmer{A, K, N}, LongSequence{A}}, + state, +) where {A, K, N} i, fwkmer = state i += 1 if i > it.stop @@ -88,24 +104,52 @@ end bits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) kmer = leftshift_carry(fwkmer, bps, bits) pos = i - K + 1 - return (pos, Kmer{A,K,N}(kmer)), (i, kmer) + return (pos, Kmer{A, K, N}(kmer)), (i, kmer) end end ## Special case where iterating over 2-Bit encoded kmers in a 4-Bit encoded sequence, ## behaviour is to produce kmers by skipping over the ambiguous sites. 
-const kmerbits = (UInt64(0xff), UInt64(0x00), UInt64(0x01), UInt64(0xff), - UInt64(0x02), UInt64(0xff), UInt64(0xff), UInt64(0xff), - UInt64(0x03), UInt64(0xff), UInt64(0xff), UInt64(0xff), - UInt64(0xff), UInt64(0xff), UInt64(0xff), UInt64(0xff)) - -@inline Base.IteratorSize(::Type{<:EveryKmer{Kmer{A,N,K},S}}) where {A<:NucleicAcidAlphabet{2},N,K,B<:NucleicAcidAlphabet{4},S<:BioSequence{B}} = Base.SizeUnknown() - -@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},S}, - state = (it.start - 1, 1, blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},S<:BioSequence{B},K,N} - +const kmerbits = ( + UInt64(0xff), + UInt64(0x00), + UInt64(0x01), + UInt64(0xff), + UInt64(0x02), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), + UInt64(0x03), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), + UInt64(0xff), +) + +@inline Base.IteratorSize( + ::Type{<:EveryKmer{Kmer{A, N, K}, S}}, +) where { + A <: NucleicAcidAlphabet{2}, + N, + K, + B <: NucleicAcidAlphabet{4}, + S <: BioSequence{B}, +} = Base.SizeUnknown() + +@inline function Base.iterate( + it::EveryKmer{Kmer{A, K, N}, S}, + state=(it.start - 1, 1, blank_ntuple(Kmer{A, K, N})), +) where { + A <: NucleicAcidAlphabet{2}, + B <: NucleicAcidAlphabet{4}, + S <: BioSequence{B}, + K, + N, +} i, filled, fwkmer = state i += 1 filled -= 1 @@ -116,9 +160,9 @@ const kmerbits = (UInt64(0xff), UInt64(0x00), UInt64(0x01), UInt64(0xff), fwkmer = leftshift_carry(fwkmer, 2, fbits) filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) if filled == K - return (i - K + 1, Kmer{A,K,N}(fwkmer)), (i, filled, fwkmer) + return (i - K + 1, Kmer{A, K, N}(fwkmer)), (i, filled, fwkmer) end i += 1 end return nothing -end \ No newline at end of file +end diff --git a/src/kmer_iteration/SpacedCanonicalKmers.jl b/src/kmer_iteration/SpacedCanonicalKmers.jl index c650cf9..9c855c4 100644 --- a/src/kmer_iteration/SpacedCanonicalKmers.jl +++ 
b/src/kmer_iteration/SpacedCanonicalKmers.jl @@ -22,15 +22,20 @@ longer `BioSequence`, between a `start` and `stop` position. reading frame will be preserved. In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. """ -struct SpacedCanonicalKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} +struct SpacedCanonicalKmers{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} seq::S start::Int step::Int stop::Int filled::Int # This is cached for speed increment::Int # This is cached for speed - - function SpacedCanonicalKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} + + function SpacedCanonicalKmers{T, S}( + seq::S, + step::Int, + start::Int, + stop::Int, + ) where {T <: Kmer, S <: BioSequence} T′ = kmertype(T) checkmer(T′) # Should inline and constant fold. if step <= 1 @@ -38,7 +43,7 @@ struct SpacedCanonicalKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} end filled = max(0, ksize(T′) - step) increment = max(1, step - ksize(T′) + 1) - return new{T′,S}(seq, start, step, stop, filled, increment) + return new{T′, S}(seq, start, step, stop, filled, increment) end end @@ -49,8 +54,13 @@ Convenience outer constructor so you don't have to specify `S` along with `T`. E.g. Instead of `SpacedCanonicalKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedCanonicalKmers{DNACodon}(s, 3)` """ -function SpacedCanonicalKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return SpacedCanonicalKmers{T,S}(seq, step, start, stop) +function SpacedCanonicalKmers{T}( + seq::S, + step::Int, + start=firstindex(seq), + stop=lastindex(seq), +) where {T <: Kmer, S <: BioSequence} + return SpacedCanonicalKmers{T, S}(seq, step, start, stop) end """ @@ -64,34 +74,51 @@ taken from `::Val{K}`, and `N` is deduced using `A` and `K`. E.g. 
Instead of `SpacedCanonicalKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedCanonicalKmers{DNACodon}(s, 3)`, you can use `SpacedCanonicalKmers(s, Val(3), 3)` """ -function SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return SpacedCanonicalKmers{Kmer{A,K}}(seq, step, start, stop) +function SpacedCanonicalKmers( + seq::BioSequence{A}, + ::Val{K}, + step::Int, + start=firstindex(seq), + stop=lastindex(seq), +) where {A, K} + return SpacedCanonicalKmers{Kmer{A, K}}(seq, step, start, stop) end Base.step(x::SpacedCanonicalKmers) = x.step -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) +@inline function Base.iterate( + it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{A}}, +) where {A, K, N} + fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) if isnothing(fwkmer) return nothing else - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(A()), _complement_bitpar(A(), fwkmer...)...), rshift) - return (1, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (K, fwkmer, rvkmer) + rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. 
+ rvkmer = rightshift_carry( + _reverse( + BioSequences.BitsPerSymbol(A()), + _complement_bitpar(A(), fwkmer...)..., + ), + rshift, + ) + return (1, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (K, fwkmer, rvkmer) end end -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} +@inline function Base.iterate( + it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{A}}, + state, +) where {A, K, N} i, fwkmer, rvkmer = state filled = it.filled i += it.increment - - for _ in filled:K-1 + + for _ in filled:(K - 1) if i > it.stop return nothing else bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * bps) # Based on type info, should constant fold. + rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * bps) # Based on type info, should constant fold. mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift @@ -101,11 +128,19 @@ end end end pos = i - K + 1 - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, fwkmer, rvkmer) + return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (i, fwkmer, rvkmer) end -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{B}}, state = (it.start - it.increment, 1, 0, blank_ntuple(Kmer{A,K,N}), blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} +@inline function Base.iterate( + it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{B}}, + state=( + it.start - it.increment, + 1, + 0, + blank_ntuple(Kmer{A, K, N}), + blank_ntuple(Kmer{A, K, N}), + ), +) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} i, pos, filled, fwkmer, rvkmer = state i += it.increment @@ -125,9 +160,9 @@ end end if filled == K state = (i, i - K + 1 + it.step, it.filled, 
fwkmer, rvkmer) - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), state + return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), state end i += 1 end return nothing -end \ No newline at end of file +end diff --git a/src/kmer_iteration/SpacedKmers.jl b/src/kmer_iteration/SpacedKmers.jl index 18461ee..fd5abbf 100644 --- a/src/kmer_iteration/SpacedKmers.jl +++ b/src/kmer_iteration/SpacedKmers.jl @@ -22,15 +22,20 @@ longer `BioSequence`, between a `start` and `stop` position. reading frame will be preserved. In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. """ -struct SpacedKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} +struct SpacedKmers{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} seq::S start::Int step::Int stop::Int filled::Int # This is cached for speed increment::Int # This is cached for speed - - function SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} + + function SpacedKmers{T, S}( + seq::S, + step::Int, + start::Int, + stop::Int, + ) where {T <: Kmer, S <: BioSequence} T′ = kmertype(T) checkmer(T′) # Should inline and constant fold. if step <= 1 @@ -38,7 +43,7 @@ struct SpacedKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} end filled = max(0, ksize(T′) - step) increment = max(1, step - ksize(T′) + 1) - return new{T′,S}(seq, start, step, stop, filled, increment) + return new{T′, S}(seq, start, step, stop, filled, increment) end end @@ -49,8 +54,13 @@ Convenience outer constructor so you don't have to specify `S` along with `T`. E.g. 
Instead of `SpacedKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedKmers{DNACodon}(s, 3)` """ -function SpacedKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return SpacedKmers{T,S}(seq, step, start, stop) +function SpacedKmers{T}( + seq::S, + step::Int, + start=firstindex(seq), + stop=lastindex(seq), +) where {T <: Kmer, S <: BioSequence} + return SpacedKmers{T, S}(seq, step, start, stop) end """ @@ -64,29 +74,40 @@ taken from `::Val{K}`, and `N` is deduced using `A` and `K`. E.g. Instead of `SpacedKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedKmers{DNACodon}(s, 3)`, you can use `SpacedKmers(s, Val(3), 3)` """ -function SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return SpacedKmers{Kmer{A,K}}(seq, step, start, stop) +function SpacedKmers( + seq::BioSequence{A}, + ::Val{K}, + step::Int, + start=firstindex(seq), + stop=lastindex(seq), +) where {A, K} + return SpacedKmers{Kmer{A, K}}(seq, step, start, stop) end Base.step(x::SpacedKmers) = x.step -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - kmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) +@inline function Base.iterate( + it::SpacedKmers{Kmer{A, K, N}, LongSequence{A}}, +) where {A, K, N} + kmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) if isnothing(kmer) return nothing else # Get the reverse. 
- alph = Alphabet(Kmer{A,K,N}) - return (1, Kmer{A,K,N}(kmer)), (K, kmer) + alph = Alphabet(Kmer{A, K, N}) + return (1, Kmer{A, K, N}(kmer)), (K, kmer) end end -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} +@inline function Base.iterate( + it::SpacedKmers{Kmer{A, K, N}, LongSequence{A}}, + state, +) where {A, K, N} i, kmer = state filled = it.filled i += it.increment - - for _ in filled:K-1 + + for _ in filled:(K - 1) if i > it.stop return nothing else @@ -97,11 +118,13 @@ end end end pos = i - K + 1 - return (pos, Kmer{A,K,N}(kmer)), (i, kmer) + return (pos, Kmer{A, K, N}(kmer)), (i, kmer) end -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{B}}, state = (it.start - it.increment, 1, 0, blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} +@inline function Base.iterate( + it::SpacedKmers{Kmer{A, K, N}, LongSequence{B}}, + state=(it.start - it.increment, 1, 0, blank_ntuple(Kmer{A, K, N})), +) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} i, pos, filled, kmer = state i += it.increment @@ -119,7 +142,7 @@ end end if filled == K state = (i, i - K + 1 + it.step, it.filled, kmer) - return (pos, Kmer{A,K,N}(kmer)), state + return (pos, Kmer{A, K, N}(kmer)), state end i += 1 end diff --git a/src/predicates.jl b/src/predicates.jl index 86258e6..5a79f3c 100644 --- a/src/predicates.jl +++ b/src/predicates.jl @@ -2,10 +2,10 @@ ### Mer specific specializations of src/biosequence/predicates.jl ### -Base.cmp(x::T, y::T) where {T<:Kmer} = cmp(x.data, y.data) -Base.:(==)(x::T, y::T) where {T<:Kmer} = x.data == y.data -Base.isless(x::T, y::T) where {T<:Kmer} = isless(x.data, y.data) +Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) +Base.:(==)(x::T, y::T) where {T <: Kmer} = x.data == y.data +Base.isless(x::T, y::T) where {T <: Kmer} = isless(x.data, y.data) # TODO: Ensure this is the right way to go. 
# See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 -Base.hash(x::Kmer{A,K,N}, h::UInt) where {A,K,N} = hash(x.data, h ⊻ K) \ No newline at end of file +Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) diff --git a/src/revtrans.jl b/src/revtrans.jl index e4ebedf..7cb021b 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -33,11 +33,11 @@ struct CodonSet <: AbstractSet{RNACodon} CodonSet(x::UInt64, ::Unsafe) = new(x) end CodonSet() = CodonSet(UInt64(0), Unsafe()) -CodonSet(itr) = foldl(push, itr, init=CodonSet()) +CodonSet(itr) = foldl(push, itr; init=CodonSet()) function Base.iterate(x::CodonSet, s::UInt64=x.x) codon = RNACodon((trailing_zeros(s) % UInt64,)) - iszero(s) ? nothing : (codon, s & (s-1)) + iszero(s) ? nothing : (codon, s & (s - 1)) end function push(s::CodonSet, x::RNACodon) @@ -52,8 +52,8 @@ Base.filter(f, s::CodonSet) = CodonSet(Iterators.filter(f, s)) Base.setdiff(a::CodonSet, b::Vararg{CodonSet}) = CodonSet(a.x & ~(union(b...).x), Unsafe()) for (name, f) in [(:union, |), (:intersect, &), (:symdiff, ⊻)] - @eval function Base.$(name)(a::CodonSet, b::Vararg{CodonSet}) - CodonSet(mapreduce(i -> i.x, $f, b, init=a.x), Unsafe()) + @eval function Base.$(name)(a::CodonSet, b::Vararg{CodonSet}) + CodonSet(mapreduce(i -> i.x, $f, b; init=a.x), Unsafe()) end end @@ -89,13 +89,13 @@ See also: [`reverse_translate`](@ref) """ struct ReverseGeneticCode <: AbstractDict{AminoAcid, CodonSet} name::String - sets::NTuple{N_AA-1, CodonSet} + sets::NTuple{N_AA - 1, CodonSet} end function ReverseGeneticCode(x::BioSequences.GeneticCode) ind(aa::AminoAcid) = reinterpret(UInt8, aa) + 1 - sets = fill(CodonSet(), N_AA-1) + sets = fill(CodonSet(), N_AA - 1) x_set = CodonSet() for i in Int64(0):Int64(63) aa = x.tbl[i + 1] @@ -122,9 +122,7 @@ function ReverseGeneticCode(x::BioSequences.GeneticCode) ReverseGeneticCode(x.name, Tuple(sets)) end -const rev_standard_genetic_code = ReverseGeneticCode( - 
BioSequences.standard_genetic_code -) +const rev_standard_genetic_code = ReverseGeneticCode(BioSequences.standard_genetic_code) function Base.getindex(s::ReverseGeneticCode, a::AminoAcid) if reinterpret(UInt8, a) > (N_AA - 2) # cannot translate gap @@ -136,7 +134,7 @@ end Base.length(c::ReverseGeneticCode) = length(c.sets) function Base.iterate(c::ReverseGeneticCode, s=1) s > length(c.sets) && return nothing - return (reinterpret(AminoAcid, (s-1)%UInt8) => c.sets[s], s+1) + return (reinterpret(AminoAcid, (s - 1) % UInt8) => c.sets[s], s + 1) end """ @@ -146,11 +144,7 @@ Reverse-translates `s` under the reverse genetic code `code`, putting the result See also: [`reverse_translate`](@ref) """ -function reverse_translate!( - v::Vector{CodonSet}, - seq::AASeq, - code=rev_standard_genetic_code -) +function reverse_translate!(v::Vector{CodonSet}, seq::AASeq, code=rev_standard_genetic_code) resize!(v, length(seq)) @inbounds for i in eachindex(v) v[i] = code[seq[i]] diff --git a/src/transformations.jl b/src/transformations.jl index 6aa3456..6f898c2 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -1,25 +1,28 @@ # Bit-parallel element nucleotide complementation -@inline function _complement_bitpar(a::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} +@inline function _complement_bitpar( + a::A, + head::UInt64, + tail..., +) where {A <: NucleicAcidAlphabet} return (BioSequences.complement_bitpar(head, A()), _complement_bitpar(a, tail...)...) 
end -@inline _complement_bitpar(a::A) where {A<:NucleicAcidAlphabet} = () +@inline _complement_bitpar(a::A) where {A <: NucleicAcidAlphabet} = () -@inline function pushfirst(x::Kmer{A,K,N}, nt) where {A,K,N} +@inline function pushfirst(x::Kmer{A, K, N}, nt) where {A, K, N} ntbits = UInt64(BioSequences.encode(A(), nt)) << (62 - (64N - 2K)) #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) << (62 - (64N - 2K)) - return Kmer{A,K,N}(_rightshift_carry(2, ntbits, x.data...)) + return Kmer{A, K, N}(_rightshift_carry(2, ntbits, x.data...)) end -@inline function pushlast(x::Kmer{A,K,N}, nt) where {A,K,N} +@inline function pushlast(x::Kmer{A, K, N}, nt) where {A, K, N} ntbits = UInt64(BioSequences.encode(A(), nt)) #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) _, newbits = _leftshift_carry(2, ntbits, x.data...) - return Kmer{A,K,N}(newbits) + return Kmer{A, K, N}(newbits) end - ### ### Transformation methods ### @@ -37,7 +40,7 @@ DNA 5-mer: AATCG ``` """ -@inline function BioSequences.complement(seq::T) where {T<:Kmer} +@inline function BioSequences.complement(seq::T) where {T <: Kmer} return T(_complement_bitpar(Alphabet(seq), seq.data...)) end @@ -54,11 +57,11 @@ DNA 5-mer: CGATT ``` """ -@inline function Base.reverse(seq::Kmer{A,K,N}) where {A,K,N} +@inline function Base.reverse(seq::Kmer{A, K, N}) where {A, K, N} rdata = _reverse(BioSequences.BitsPerSymbol(seq), seq.data...) - # rshift should constant-fold. - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) - return Kmer{A,K,N}(rightshift_carry(rdata, rshift)) # based on only 2 bit alphabet. + # rshift should constant-fold. + rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) + return Kmer{A, K, N}(rightshift_carry(rdata, rshift)) # based on only 2 bit alphabet. 
end """ @@ -74,7 +77,7 @@ DNA 5-mer: GCTAA ``` """ -@inline function BioSequences.reverse_complement(seq::Kmer{A,K,N}) where {A,K,N} +@inline function BioSequences.reverse_complement(seq::Kmer{A, K, N}) where {A, K, N} return complement(reverse(seq)) end @@ -103,12 +106,12 @@ DNA 5-mer: GCTAA ``` """ -@inline function BioSequences.canonical(seq::Kmer{A,K,N}) where {A,K,N} +@inline function BioSequences.canonical(seq::Kmer{A, K, N}) where {A, K, N} if N < 4 - return min(seq, reverse_complement(seq)) - else - return iscanonical(seq) ? seq : reverse_complement(seq) - end + return min(seq, reverse_complement(seq)) + else + return iscanonical(seq) ? seq : reverse_complement(seq) + end end ### @@ -126,8 +129,6 @@ function swap(x::T, i, j) where {T<:AbstractMer} return T(b ⊻ ((x << i) | (x << j))) end - - function Random.shuffle(x::T) where {T<:AbstractMer} # Fisher-Yates shuffle for mers. j = lastindex(x) @@ -141,14 +142,14 @@ end throw_translate_err(K) = error("Cannot translate Kmer of size $K not divisible by 3") -@inline function setup_translate(seq::Kmer{<:NucleicAcidAlphabet, K}) where K +@inline function setup_translate(seq::Kmer{<:NucleicAcidAlphabet, K}) where {K} naa, rem = divrem(K, 3) iszero(rem) || throw_translate_err(K) kmertype(AAKmer{naa}) end # This sets the first amino acid to methionine, returning the data tuple -@inline function set_methionine_data(data::Tuple{Vararg{UInt64}}, ::Val{K}) where K +@inline function set_methionine_data(data::Tuple{Vararg{UInt64}}, ::Val{K}) where {K} offset = ((K - 1) * 8) & 63 mask = ~(UInt64(0xff) << offset) # mask off existing AA in pos 1 addition = UInt64(0x0c) << offset # 0x0c is encoded methionine @@ -160,15 +161,15 @@ end function BioSequences.translate( seq::Union{RNAKmer, DNAKmer}; code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, # a noop for this method - alternative_start::Bool = false -) + allow_ambiguous_codons::Bool=true, # a noop for this method + alternative_start::Bool=false, 
+) T = setup_translate(seq) data = blank_ntuple(T) for i in 1:ksize(T) - a = seq[3*i - 2] - b = seq[3*i - 1] - c = seq[3*i - 0] + a = seq[3 * i - 2] + b = seq[3 * i - 1] + c = seq[3 * i - 0] codon = BioSequences.unambiguous_codon(a, b, c) aa = code[codon] # Next line is equivalent to encode, but without checking. @@ -189,28 +190,31 @@ end function BioSequences.translate( seq::Kmer{<:NucleicAcidAlphabet}; code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, - alternative_start::Bool = false -) + allow_ambiguous_codons::Bool=true, + alternative_start::Bool=false, +) T = setup_translate(seq) data = blank_ntuple(T) for i in 1:ksize(T) - a = reinterpret(RNA, seq[3*i - 2]) - b = reinterpret(RNA, seq[3*i - 1]) - c = reinterpret(RNA, seq[3*i - 0]) - aa = if BioSequences.isambiguous(a) | BioSequences.isambiguous(b) | BioSequences.isambiguous(c) - aa_ = BioSequences.try_translate_ambiguous_codon(code, a, b, c) - if aa_ === nothing - if allow_ambiguous_codons - aa_ = AA_X - else - error("codon ", a, b, c, " cannot be unambiguously translated") + a = reinterpret(RNA, seq[3 * i - 2]) + b = reinterpret(RNA, seq[3 * i - 1]) + c = reinterpret(RNA, seq[3 * i - 0]) + aa = + if BioSequences.isambiguous(a) | + BioSequences.isambiguous(b) | + BioSequences.isambiguous(c) + aa_ = BioSequences.try_translate_ambiguous_codon(code, a, b, c) + if aa_ === nothing + if allow_ambiguous_codons + aa_ = AA_X + else + error("codon ", a, b, c, " cannot be unambiguously translated") + end end + aa_ + else + code[BioSequences.unambiguous_codon(a, b, c)] end - aa_ - else - code[BioSequences.unambiguous_codon(a, b, c)] - end enc_data = reinterpret(UInt8, aa) % UInt64 data = leftshift_carry(data, 8, enc_data) end diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 4476956..c31c338 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -2,14 +2,15 @@ # TODO: this should end up in BioSequences.jl? 
"Extract the element stored in a packed bitarray referred to by bidx." -@inline function BioSequences.extract_encoded_element(bidx::BioSequences.BitIndex{N,W}, data::NTuple{n,W}) where {N,n,W} +@inline function BioSequences.extract_encoded_element( + bidx::BioSequences.BitIndex{N, W}, + data::NTuple{n, W}, +) where {N, n, W} @inbounds chunk = data[BioSequences.index(bidx)] offchunk = chunk >> (BioSequences.bitwidth(W) - N - BioSequences.offset(bidx)) return offchunk & BioSequences.bitmask(bidx) end - - """ _cliphead(by::Integer, head::UInt64, tail...) @@ -35,17 +36,32 @@ pushfirst and pushlast can be efficiently implemented without duplication of cod or less efficient implementations that first shift and then insert an element. =# -@inline function rightshift_carry(x::NTuple{N,UInt64}, nbits::Integer, prevcarry = zero(UInt64)) where {N} +@inline function rightshift_carry( + x::NTuple{N, UInt64}, + nbits::Integer, + prevcarry=zero(UInt64), +) where {N} return _rightshift_carry(nbits, prevcarry, x...) end @inline function _rightshift_carry(nbits::Integer, carry::UInt64, head::UInt64, tail...) - return ((head >> nbits) | carry, _rightshift_carry(nbits, (head & ((one(UInt64) << nbits) - 1)) << (64 - nbits), tail...)...) + return ( + (head >> nbits) | carry, + _rightshift_carry( + nbits, + (head & ((one(UInt64) << nbits) - 1)) << (64 - nbits), + tail..., + )..., + ) end @inline _rightshift_carry(nbits::Integer, carry::UInt64) = () -@inline function leftshift_carry(x::NTuple{N,UInt64}, nbits::Integer, prevcarry::UInt64 = zero(UInt64)) where {N} +@inline function leftshift_carry( + x::NTuple{N, UInt64}, + nbits::Integer, + prevcarry::UInt64=zero(UInt64), +) where {N} _, newbits = _leftshift_carry(nbits, prevcarry, x...) return newbits end @@ -57,7 +73,11 @@ end @inline _leftshift_carry(nbits::Integer, prevcarry::UInt64) = prevcarry, () -@inline function _reverse(bpe::BioSequences.BitsPerSymbol{N}, head::UInt64, tail...) 
where {N} +@inline function _reverse( + bpe::BioSequences.BitsPerSymbol{N}, + head::UInt64, + tail..., +) where {N} return (_reverse(bpe, tail...)..., BioSequences.reversebits(head, bpe)) end @@ -69,4 +89,4 @@ end end @inline _reverse(f::F, ::BioSequences.BitsPerSymbol{N}) where {N,F<:Function} = () -=# \ No newline at end of file +=# diff --git a/test/access.jl b/test/access.jl index aa4a35e..71dd487 100644 --- a/test/access.jl +++ b/test/access.jl @@ -1,14 +1,14 @@ @testset "Access and Iterations" begin dna_kmer = mer"ACTG"dna rna_kmer = mer"ACUG"rna - aa_kmer = mer"MVXN"aa + aa_kmer = mer"MVXN"aa @testset "Access DNA Kmer" begin @test dna_kmer[1] == DNA_A @test dna_kmer[2] == DNA_C @test dna_kmer[3] == DNA_T @test dna_kmer[4] == DNA_G - + @test dna_kmer[1:3] == mer"ACT"dna @test dna_kmer[2:4] == mer"CTG"dna @@ -16,7 +16,7 @@ @test_throws BoundsError dna_kmer[-1] @test_throws BoundsError dna_kmer[0] @test_throws BoundsError dna_kmer[5] - @test_throws BoundsError getindex(dna_kmer,-1) + @test_throws BoundsError getindex(dna_kmer, -1) @test_throws BoundsError getindex(dna_kmer, 0) @test_throws BoundsError getindex(dna_kmer, 5) @test_throws BoundsError dna_kmer[3:7] @@ -28,9 +28,9 @@ @test iterate(DNAKmer("ACTG"), 1) == (DNA_A, 2) @test iterate(DNAKmer("ACTG"), 4) == (DNA_G, 5) - @test iterate(DNAKmer("ACTG"), 1) !== nothing - @test iterate(DNAKmer("ACTG"), 4) !== nothing - @test iterate(DNAKmer("ACTG"), 5) === nothing + @test iterate(DNAKmer("ACTG"), 1) !== nothing + @test iterate(DNAKmer("ACTG"), 4) !== nothing + @test iterate(DNAKmer("ACTG"), 5) === nothing @test isnothing(iterate(DNAKmer("ACTG"), -1)) @test iterate(DNAKmer("ACTG"), 0) === nothing @@ -43,7 +43,7 @@ @test rna_kmer[2] == RNA_C @test rna_kmer[3] == RNA_U @test rna_kmer[4] == RNA_G - + @test rna_kmer[1:3] == mer"ACU"rna @test rna_kmer[2:4] == mer"CUG"rna @@ -59,22 +59,20 @@ @testset "Iteration through RNA Kmer" begin @test iterate(RNAKmer("ACUG")) == (RNA_A, 2) - @test iterate(RNAKmer("ACUG"), 1) == 
(RNA_A, 2) @test iterate(RNAKmer("ACUG"), 4) == (RNA_G, 5) - @test iterate(RNAKmer("ACUG"), 1) !== nothing - @test iterate(RNAKmer("ACUG"), 4) !== nothing - @test iterate(RNAKmer("ACUG"), 5) === nothing + @test iterate(RNAKmer("ACUG"), 4) !== nothing + @test iterate(RNAKmer("ACUG"), 5) === nothing @test iterate(RNAKmer("ACUG"), -1) === nothing @test iterate(RNAKmer("ACUG"), 0) === nothing rna_vec = [RNA_A, RNA_C, RNA_U, RNA_G] @test all([nt === rna_vec[i] for (i, nt) in enumerate(rna_kmer)]) end - + @testset "Access AA Kmer" begin @test aa_kmer[1] == AA_M @test aa_kmer[2] == AA_V @@ -88,7 +86,7 @@ @test_throws BoundsError aa_kmer[-1] @test_throws BoundsError aa_kmer[0] @test_throws BoundsError aa_kmer[5] - @test_throws BoundsError getindex(aa_kmer,-1) + @test_throws BoundsError getindex(aa_kmer, -1) @test_throws BoundsError getindex(aa_kmer, 0) @test_throws BoundsError getindex(aa_kmer, 5) @test_throws BoundsError aa_kmer[3:7] diff --git a/test/biosequences_interface.jl b/test/biosequences_interface.jl index 5db38a3..a52680b 100644 --- a/test/biosequences_interface.jl +++ b/test/biosequences_interface.jl @@ -1,12 +1,51 @@ @testset "BioSequences Interface" begin - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{2},31}), rand(ACGT, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{4},31}), rand(ACGT, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{2},31}), rand(ACGU, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{4},31}), rand(ACGU, 31), false) - - - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{2},200}), rand(ACGT, 200), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{4},200}), rand(ACGT, 200), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{2},200}), rand(ACGU, 200), false) - @test 
BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{4},200}), rand(ACGU, 200), false) -end \ No newline at end of file + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{DNAAlphabet{2}, 31}), + rand(ACGT, 31), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{DNAAlphabet{4}, 31}), + rand(ACGT, 31), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{RNAAlphabet{2}, 31}), + rand(ACGU, 31), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{RNAAlphabet{4}, 31}), + rand(ACGU, 31), + false, + ) + + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{DNAAlphabet{2}, 200}), + rand(ACGT, 200), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{DNAAlphabet{4}, 200}), + rand(ACGT, 200), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{RNAAlphabet{2}, 200}), + rand(ACGU, 200), + false, + ) + @test BioSequences.has_interface( + BioSequence, + Kmers.kmertype(Kmer{RNAAlphabet{4}, 200}), + rand(ACGU, 200), + false, + ) +end diff --git a/test/comparisons.jl b/test/comparisons.jl index 78bf48a..9c84e05 100644 --- a/test/comparisons.jl +++ b/test/comparisons.jl @@ -13,32 +13,32 @@ # True negatives @test DNAKmer("ACG") != RNAKmer("ACG") - @test DNAKmer("T") != RNAKmer("U") - @test DNAKmer("AC") != DNAKmer("AG") - @test RNAKmer("AC") != RNAKmer("AG") - @test AAKmer("MV") != AAKmer("NM") + @test DNAKmer("T") != RNAKmer("U") + @test DNAKmer("AC") != DNAKmer("AG") + @test RNAKmer("AC") != RNAKmer("AG") + @test AAKmer("MV") != AAKmer("NM") @test DNAKmer("ACG") != rna"ACG" - @test DNAKmer("T") != rna"U" - @test DNAKmer("AC") != dna"AG" - @test RNAKmer("AC") != rna"AG" - @test AAKmer("MV") != aa"NM" + @test DNAKmer("T") != rna"U" + @test DNAKmer("AC") != dna"AG" + @test RNAKmer("AC") != rna"AG" + @test AAKmer("MV") != aa"NM" @test rna"ACG" != 
DNAKmer("ACG") - @test rna"U" != DNAKmer("T") - @test dna"AG" != DNAKmer("AC") - @test rna"AG" != RNAKmer("AC") - @test aa"MV" != AAKmer("NM") + @test rna"U" != DNAKmer("T") + @test dna"AG" != DNAKmer("AC") + @test rna"AG" != RNAKmer("AC") + @test aa"MV" != AAKmer("NM") end @testset "Inequality" begin for len in [1, 10, 32, 64] if len <= 32 - @test isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(1),))) + @test isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(1),))) @test !isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(0),))) @test !isless(DNAKmer{1}((UInt64(1),)), DNAKmer{1}((UInt64(0),))) - - @test isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(1),))) + + @test isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(1),))) @test !isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(0),))) @test !isless(RNAKmer{1}((UInt64(1),)), RNAKmer{1}((UInt64(0),))) end @@ -50,12 +50,12 @@ for x in kmers, y in kmers @test (x == y) == (hash(x) == hash(y)) end - + kmers = map(RNAKmer, ["AAAA", "AACU", "ACGU", "UGCA"]) for x in kmers, y in kmers @test (x == y) == (hash(x) == hash(y)) end - + kmers = map(AAKmer, ["AMVK", "FPST", "QEGH", "ARND"]) for x in kmers, y in kmers @test (x == y) == (hash(x) == hash(y)) diff --git a/test/construction_and_conversion.jl b/test/construction_and_conversion.jl index 567042c..68c1465 100644 --- a/test/construction_and_conversion.jl +++ b/test/construction_and_conversion.jl @@ -7,40 +7,48 @@ global reps = 10 @test DNAKmer(DNA_G, DNA_C, DNA_T) == Kmer("GCT") @test RNAKmer(RNA_G, RNA_U, RNA_C, RNA_U) == Kmer("GUCU") - + # creation from iterator - @test Kmers.kmertype(Kmer{DNAAlphabet{2},31})((i for i in rand(ACGT, 31))) isa Kmers.kmertype(Kmer{DNAAlphabet{2},31}) - + @test Kmers.kmertype(Kmer{DNAAlphabet{2}, 31})((i for i in rand(ACGT, 31))) isa + Kmers.kmertype(Kmer{DNAAlphabet{2}, 31}) + # Check that kmers in strings survive round trip conversion: # String → Kmer → String - function check_string_construction(::Type{T}, 
seq::AbstractString) where {T<:Kmer} + function check_string_construction(::Type{T}, seq::AbstractString) where {T <: Kmer} return String(T(seq)) == uppercase(seq) end - + # Check that RNAKmers can be constructed from a LongRNASeq # LongSequence{A} → Kmer{A,K,N} → LongSequence{A} - function check_longsequence_construction(::Type{T}, seq::S) where {T<:Kmer,S<:LongSequence} + function check_longsequence_construction( + ::Type{T}, + seq::S, + ) where {T <: Kmer, S <: LongSequence} return S(T(seq)) == seq end # Check that kmers can be constructed from a BioSequence # BioSequence → Kmer → BioSequence - function check_biosequence_construction(::Type{T}, seq::LongSequence) where {T<:Kmer} + function check_biosequence_construction(::Type{T}, seq::LongSequence) where {T <: Kmer} return LongSequence(T(seq)) == seq end # Check that kmers can be constructed from an array of nucleotides # Vector{T} → Kmer → Vector{T} - function check_nucarray_kmer(::Type{M}, seq::Vector{T}) where {T,M<:Kmer} + function check_nucarray_kmer(::Type{M}, seq::Vector{T}) where {T, M <: Kmer} return String([convert(Char, c) for c in seq]) == String(M(seq)) end # Check that kmers in strings survive round trip conversion: # String → BioSequence → Kmer → BioSequence → String - function check_roundabout_construction(::Type{T}, A2, seq::AbstractString) where {T<:Kmer} + function check_roundabout_construction( + ::Type{T}, + A2, + seq::AbstractString, + ) where {T <: Kmer} return String(LongSequence{A2}(T(LongSequence{A2}(seq)))) == uppercase(seq) end - + #= function check_uint_conversion(::Type{T}) where {T<:Kmer} U = BioSequences.encoded_data_type(T) @@ -54,68 +62,362 @@ global reps = 10 # String construction # Check that kmers in strings survive round trip conversion: # String → Kmer → String - @test all(Bool[check_string_construction(DNAKmer{len}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(Kmer{DNAAlphabet{4},len}, random_dna_kmer(len)) for _ in 1:reps]) - @test 
all(Bool[check_string_construction(RNAKmer{len}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(Kmer{RNAAlphabet{4},len}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps]) - + @test all( + Bool[ + check_string_construction(DNAKmer{len}, random_dna_kmer(len)) for + _ in 1:reps + ], + ) + @test all( + Bool[ + check_string_construction( + Kmer{DNAAlphabet{4}, len}, + random_dna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_string_construction(RNAKmer{len}, random_rna_kmer(len)) for + _ in 1:reps + ], + ) + @test all( + Bool[ + check_string_construction( + Kmer{RNAAlphabet{4}, len}, + random_rna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps + ], + ) + # Long(DNA|RNA)Seq Constructions # Check that DNAKmers can be constructed from a Long(DNA|RNA)Seq # Long(DNA|RNA)Seq → Kmer → Long(DNA|RNA)Seq - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2},len}, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{4},len}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{4},len}, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2},len}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2},len}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4},len}, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4},len}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2},len}, 
LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(AAKmer{len}, LongAA(random_aa(len))) for _ in 1:reps]) - + @test all( + Bool[ + check_longsequence_construction( + Kmer{DNAAlphabet{2}, len}, + LongDNA{2}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{DNAAlphabet{4}, len}, + LongDNA{4}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{DNAAlphabet{4}, len}, + LongDNA{2}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{DNAAlphabet{2}, len}, + LongDNA{4}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{2}, len}, + LongRNA{2}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{4}, len}, + LongRNA{4}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{4}, len}, + LongRNA{2}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{2}, len}, + LongRNA{4}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction(AAKmer{len}, LongAA(random_aa(len))) for + _ in 1:reps + ], + ) + # Check Kmer{A1}(::BioSequence{A2}) for compatible A1 and A2 - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4}}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2}}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4}}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2}}, 
LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{4}}, + LongRNA{2}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{2}}, + LongDNA{4}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{RNAAlphabet{4}}, + LongDNA{4}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction( + Kmer{DNAAlphabet{2}}, + LongRNA{4}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) # BioSequence Construction # Check that kmers can be constructed from a BioSequence # BioSequence → Kmer → BioSequence - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{2},len}, LongSequence{DNAAlphabet{2}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{4},len}, LongSequence{DNAAlphabet{4}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{2},len}, LongSequence{DNAAlphabet{4}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{4},len}, LongSequence{DNAAlphabet{2}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{2},len}, LongSequence{RNAAlphabet{2}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{4},len}, LongSequence{RNAAlphabet{4}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{2},len}, LongSequence{RNAAlphabet{4}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{4},len}, LongSequence{RNAAlphabet{2}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(AAKmer{len}, 
LongSequence{AminoAcidAlphabet}(random_aa(len))) for _ in 1:reps]) + @test all( + Bool[ + check_biosequence_construction( + Kmer{DNAAlphabet{2}, len}, + LongSequence{DNAAlphabet{2}}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{DNAAlphabet{4}, len}, + LongSequence{DNAAlphabet{4}}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{DNAAlphabet{2}, len}, + LongSequence{DNAAlphabet{4}}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{DNAAlphabet{4}, len}, + LongSequence{DNAAlphabet{2}}(random_dna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{RNAAlphabet{2}, len}, + LongSequence{RNAAlphabet{2}}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{RNAAlphabet{4}, len}, + LongSequence{RNAAlphabet{4}}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{RNAAlphabet{2}, len}, + LongSequence{RNAAlphabet{4}}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + Kmer{RNAAlphabet{4}, len}, + LongSequence{RNAAlphabet{2}}(random_rna_kmer(len)), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_biosequence_construction( + AAKmer{len}, + LongSequence{AminoAcidAlphabet}(random_aa(len)), + ) for _ in 1:reps + ], + ) # Check Kmer(::BioSequence) construction - @test all(Bool[check_longsequence_construction(Kmer, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer, LongAA(random_rna_kmer(len))) for _ in 1:reps]) + @test all( + Bool[ + check_longsequence_construction(Kmer, 
LongRNA{4}(random_rna_kmer(len))) + for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction(Kmer, LongDNA{2}(random_dna_kmer(len))) + for _ in 1:reps + ], + ) + @test all( + Bool[ + check_longsequence_construction(Kmer, LongAA(random_rna_kmer(len))) for + _ in 1:reps + ], + ) # Construction from element arrays # Check that kmers can be constructed from an array of elements # Vector{T} → Kmer{A,K,N} → Vector{T} - @test all(Bool[check_nucarray_kmer(Kmer{DNAAlphabet{2},len}, random_dna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0])) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{DNAAlphabet{4},len}, random_dna_symbols(len)) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{RNAAlphabet{2},len}, random_rna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0])) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{RNAAlphabet{4},len}, random_rna_symbols(len)) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(AAKmer{len}, random_aa_symbols(len)) for _ in 1:reps]) - + @test all( + Bool[ + check_nucarray_kmer( + Kmer{DNAAlphabet{2}, len}, + random_dna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0]), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_nucarray_kmer(Kmer{DNAAlphabet{4}, len}, random_dna_symbols(len)) + for _ in 1:reps + ], + ) + @test all( + Bool[ + check_nucarray_kmer( + Kmer{RNAAlphabet{2}, len}, + random_rna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0]), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_nucarray_kmer(Kmer{RNAAlphabet{4}, len}, random_rna_symbols(len)) + for _ in 1:reps + ], + ) + @test all( + Bool[ + check_nucarray_kmer(AAKmer{len}, random_aa_symbols(len)) for _ in 1:reps + ], + ) + # Roundabout conversions - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{2},len}, DNAAlphabet{2}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{4},len}, DNAAlphabet{4}, random_dna_kmer(len)) for _ in 1:reps]) - @test 
all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{2},len}, DNAAlphabet{4}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{4},len}, DNAAlphabet{2}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{2},len}, RNAAlphabet{2}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{4},len}, RNAAlphabet{4}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{2},len}, RNAAlphabet{4}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{4},len}, RNAAlphabet{2}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(AAKmer{len}, AminoAcidAlphabet, random_aa(len)) for _ in 1:reps]) + @test all( + Bool[ + check_roundabout_construction( + Kmer{DNAAlphabet{2}, len}, + DNAAlphabet{2}, + random_dna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{DNAAlphabet{4}, len}, + DNAAlphabet{4}, + random_dna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{DNAAlphabet{2}, len}, + DNAAlphabet{4}, + random_dna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{DNAAlphabet{4}, len}, + DNAAlphabet{2}, + random_dna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{RNAAlphabet{2}, len}, + RNAAlphabet{2}, + random_rna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{RNAAlphabet{4}, len}, + RNAAlphabet{4}, + random_rna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + Kmer{RNAAlphabet{2}, len}, + RNAAlphabet{4}, + random_rna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + 
check_roundabout_construction( + Kmer{RNAAlphabet{4}, len}, + RNAAlphabet{2}, + random_rna_kmer(len), + ) for _ in 1:reps + ], + ) + @test all( + Bool[ + check_roundabout_construction( + AAKmer{len}, + AminoAcidAlphabet, + random_aa(len), + ) for _ in 1:reps + ], + ) end end @@ -141,13 +443,13 @@ global reps = 10 # Test string literals @test mer"ACTG"dna == DNAKmer(LongDNA{4}("ACTG")) - @test mer"AVBM"aa == AAKmer(LongAA("AVBM")) + @test mer"AVBM"aa == AAKmer(LongAA("AVBM")) @test isa(mer"ACGT"dna, DNAKmer{4}) - @test isa(mer"AVBM"aa, AAKmer{4}) + @test isa(mer"AVBM"aa, AAKmer{4}) @test_throws LoadError eval(:(mer"ACGN"dna)) @test_throws LoadError eval(:(mer"ACG-"dna)) end - + @testset "Capacity" begin @test Kmers.capacity(DNAKmer(random_dna_kmer(10))) == 32 @test Kmers.capacity(RNAKmer(random_rna_kmer(10))) == 32 @@ -157,7 +459,7 @@ global reps = 10 @test Kmers.capacity(AAKmer(random_aa(8))) == 8 @test Kmers.capacity(AAKmer(random_aa(10))) == 16 end - + @testset "N unused" begin @test Kmers.n_unused(DNAKmer(random_dna_kmer(10))) == 22 @test Kmers.n_unused(RNAKmer(random_rna_kmer(10))) == 22 diff --git a/test/debruijn_neighbors.jl b/test/debruijn_neighbors.jl index c011b84..1f92fd6 100644 --- a/test/debruijn_neighbors.jl +++ b/test/debruijn_neighbors.jl @@ -1,6 +1,10 @@ @testset "De Bruijn Neighbors" begin - @test collect(fw_neighbors(DNAKmer("ACG"))) == map(DNAKmer, ["CGA", "CGC", "CGG", "CGT"]) - @test collect(fw_neighbors(DNAKmer("GGGG"))) == map(DNAKmer, ["GGGA", "GGGC", "GGGG", "GGGT"]) - @test collect(fw_neighbors(RNAKmer("ACG"))) == map(RNAKmer, ["CGA", "CGC", "CGG", "CGU"]) - @test collect(fw_neighbors(RNAKmer("GGGG"))) == map(RNAKmer, ["GGGA", "GGGC", "GGGG", "GGGU"]) + @test collect(fw_neighbors(DNAKmer("ACG"))) == + map(DNAKmer, ["CGA", "CGC", "CGG", "CGT"]) + @test collect(fw_neighbors(DNAKmer("GGGG"))) == + map(DNAKmer, ["GGGA", "GGGC", "GGGG", "GGGT"]) + @test collect(fw_neighbors(RNAKmer("ACG"))) == + map(RNAKmer, ["CGA", "CGC", "CGG", "CGU"]) + 
@test collect(fw_neighbors(RNAKmer("GGGG"))) == + map(RNAKmer, ["GGGA", "GGGC", "GGGG", "GGGU"]) end diff --git a/test/find.jl b/test/find.jl index c3d4aaa..b8cab44 100644 --- a/test/find.jl +++ b/test/find.jl @@ -23,9 +23,9 @@ @test findfirst(DNA_G, kmer) == 3 @test findlast(DNA_A, kmer) == 4 @test findlast(DNA_G, kmer) == 5 - + kmer = AAKmer("AMVKFPSMT") - + @test findnext(AA_A, kmer, 1) == 1 @test findnext(AA_M, kmer, 1) == 2 @test findnext(AA_V, kmer, 1) == 3 @@ -35,12 +35,12 @@ @test findnext(AA_S, kmer, 1) == 7 @test findnext(AA_M, kmer, 1) == 2 @test findnext(AA_T, kmer, 1) == 9 - + @test findnext(AA_F, kmer, 4) == 5 @test findprev(AA_F, kmer, 4) == nothing @test findnext(AA_A, kmer, 7) == nothing @test findnext(AA_M, kmer, 5) == 8 - + @test findfirst(AA_M, kmer) == 2 @test findlast(AA_M, kmer) == 8 end diff --git a/test/iteration.jl b/test/iteration.jl index 801a4b0..098eab5 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -5,42 +5,44 @@ # Kmer and sequence Alphabets match. @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - + @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - + # Kmer and sequence Alphabets mismatch. s3 = dna"AC-TGAG--TGC" - @test collect(EveryKmer{DNACodon}(s3)) == - [(UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(5), Kmer(DNA_G, DNA_A, DNA_G)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C))] + @test collect(EveryKmer{DNACodon}(s3)) == [ + (UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), + (UInt64(5), Kmer(DNA_G, DNA_A, DNA_G)), + (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C)), + ] end - + @testset "EveryKmer RNA" begin s = randrnaseq(500) s2 = LongRNA{2}(s) # Kmer and sequence Alphabets match. 
@test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - + @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - + # Kmer and sequence Alphabets mismatch. s3 = rna"AC-UGAG--UGC" - @test collect(EveryKmer{RNACodon}(s3)) == - [(UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(5), Kmer(RNA_G, RNA_A, RNA_G)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C))] + @test collect(EveryKmer{RNACodon}(s3)) == [ + (UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), + (UInt64(5), Kmer(RNA_G, RNA_A, RNA_G)), + (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C)), + ] end - + @testset "EveryKmer AA" begin s = randaaseq(500) s2 = LongAA(s) @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - + @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 end @@ -50,41 +52,61 @@ end @testset "SpacedKmers DNA" begin s = randdnaseq(500) s2 = LongDNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 - + @test collect(SpacedKmers(s, Val{31}(), 50)) == + collect(SpacedKmers(s2, Val{31}(), 50)) + @test length(SpacedKmers(s, Val{31}(), 50)) == + length(SpacedKmers(s2, Val{31}(), 50)) == + 10 + + @test collect(SpacedKmers(s, Val{201}(), 50)) == + collect(SpacedKmers(s2, Val{201}(), 50)) + @test length(SpacedKmers(s, Val{201}(), 50)) == + length(SpacedKmers(s2, Val{201}(), 50)) == + 6 + s3 = 
dna"AC-TGAG--TGC" - @test collect(SpacedKmers{DNACodon}(s3, 3)) == - [(UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C))] + @test collect(SpacedKmers{DNACodon}(s3, 3)) == [ + (UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), + (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C)), + ] end - + @testset "SpacedKmers RNA" begin s = randrnaseq(500) s2 = LongRNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 - + @test collect(SpacedKmers(s, Val{31}(), 50)) == + collect(SpacedKmers(s2, Val{31}(), 50)) + @test length(SpacedKmers(s, Val{31}(), 50)) == + length(SpacedKmers(s2, Val{31}(), 50)) == + 10 + + @test collect(SpacedKmers(s, Val{201}(), 50)) == + collect(SpacedKmers(s2, Val{201}(), 50)) + @test length(SpacedKmers(s, Val{201}(), 50)) == + length(SpacedKmers(s2, Val{201}(), 50)) == + 6 + s3 = rna"AC-UGAG--UGC" - @test collect(SpacedKmers{RNACodon}(s3, 3)) == - [(UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C))] + @test collect(SpacedKmers{RNACodon}(s3, 3)) == [ + (UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), + (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C)), + ] end - + @testset "SpacedKmers AA" begin s = randaaseq(500) s2 = LongAA(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 + @test collect(SpacedKmers(s, Val{31}(), 50)) == + collect(SpacedKmers(s2, Val{31}(), 50)) + @test 
length(SpacedKmers(s, Val{31}(), 50)) == + length(SpacedKmers(s2, Val{31}(), 50)) == + 10 + + @test collect(SpacedKmers(s, Val{201}(), 50)) == + collect(SpacedKmers(s2, Val{201}(), 50)) + @test length(SpacedKmers(s, Val{201}(), 50)) == + length(SpacedKmers(s2, Val{201}(), 50)) == + 6 end end @@ -92,76 +114,62 @@ end @testset "EveryCanonicalKmer DNA" begin s = randdnaseq(500) s2 = LongDNA{2}(s) - + # Iterator generates expected results... ## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == - collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == - collect(EveryCanonicalKmer(s2, Val{201}())) - + @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == collect(EveryCanonicalKmer(s2, Val{31}())) + + @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == collect(EveryCanonicalKmer(s2, Val{201}())) + ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == - collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == - collect(EveryCanonicalKmer(s, Val{201}())) - + @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == collect(EveryCanonicalKmer(s, Val{31}())) + + @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == collect(EveryCanonicalKmer(s, Val{201}())) + # Test equivalency between different levels of bit compression... 
- @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - + @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] + @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) + + @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] + @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) + # Kmer and sequence Alphabets mismatch. s3 = dna"AC-TGAG--TGC" - @test collect(EveryCanonicalKmer{DNACodon}(s3)) == - [(UInt64(4), canonical(Kmer(DNA_T, DNA_G, DNA_A))), - (UInt64(5), canonical(Kmer(DNA_G, DNA_A, DNA_G))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C)))] + @test collect(EveryCanonicalKmer{DNACodon}(s3)) == [ + (UInt64(4), canonical(Kmer(DNA_T, DNA_G, DNA_A))), + (UInt64(5), canonical(Kmer(DNA_G, DNA_A, DNA_G))), + (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C))), + ] end - + @testset "EveryCanonicalKmer RNA" begin s = randrnaseq(500) s2 = LongRNA{2}(s) - + # Iterator generates expected results... 
## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == - collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == - collect(EveryCanonicalKmer(s2, Val{201}())) - + @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == collect(EveryCanonicalKmer(s2, Val{31}())) + + @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == collect(EveryCanonicalKmer(s2, Val{201}())) + ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == - collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == - collect(EveryCanonicalKmer(s, Val{201}())) - + @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == collect(EveryCanonicalKmer(s, Val{31}())) + + @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == collect(EveryCanonicalKmer(s, Val{201}())) + # Test equivalency between different levels of bit compression... 
- @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - + @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] + @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) + + @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] + @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) + s3 = rna"AC-UGAG--UGC" - @test collect(EveryCanonicalKmer{RNACodon}(s3)) == - [(UInt64(4), canonical(Kmer(RNA_U, RNA_G, RNA_A))), - (UInt64(5), canonical(Kmer(RNA_G, RNA_A, RNA_G))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C)))] + @test collect(EveryCanonicalKmer{RNACodon}(s3)) == [ + (UInt64(4), canonical(Kmer(RNA_U, RNA_G, RNA_A))), + (UInt64(5), canonical(Kmer(RNA_G, RNA_A, RNA_G))), + (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C))), + ] end end @@ -173,22 +181,29 @@ end @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == 
collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == 10 - + @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == + collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) + @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == + length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == + 10 + @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == 6 - + @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == + collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) + @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == + length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == + 6 + s3 = dna"AC-TGAG--TGC" - @test collect(SpacedCanonicalKmers{DNACodon}(s3, 3)) == - [(UInt64(4), canonical(Kmer(DNA_T, DNA_C, DNA_A))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C)))] + @test collect(SpacedCanonicalKmers{DNACodon}(s3, 3)) == [ + (UInt64(4), canonical(Kmer(DNA_T, DNA_C, DNA_A))), + (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C))), + ] end - + @testset "SpacedCanonicalKmers RNA" begin s = randrnaseq(500) s2 = LongRNA{2}(s) @@ -196,19 +211,26 @@ end @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) @test [(x[1], canonical(x[2])) for x in 
SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == 10 - + @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == + collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) + @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == + length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == + 10 + @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == 6 - + @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == + collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) + @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == + length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == + 6 + s3 = rna"AC-UGAG--UGC" - @test collect(SpacedCanonicalKmers{RNACodon}(s3, 3)) == - [(UInt64(4), canonical(Kmer(RNA_U, RNA_C, RNA_A))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C)))] + @test collect(SpacedCanonicalKmers{RNACodon}(s3, 3)) == [ + (UInt64(4), canonical(Kmer(RNA_U, RNA_C, RNA_A))), + (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C))), + ] end -end \ No newline at 
end of file +end diff --git a/test/mismatches.jl b/test/mismatches.jl index b8e1bb0..620797c 100644 --- a/test/mismatches.jl +++ b/test/mismatches.jl @@ -12,12 +12,12 @@ b = random_dna_kmer(len) test_mismatches(DNAKmer(a), DNAKmer(b)) test_mismatches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - + a = random_rna_kmer(len) b = random_rna_kmer(len) test_mismatches(RNAKmer(a), RNAKmer(b)) test_mismatches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - + a = AAKmer(random_aa(len)) b = AAKmer(random_aa(len)) test_mismatches(a, b) @@ -38,14 +38,14 @@ end b = random_dna_kmer(len) test_matches(DNAKmer(a), DNAKmer(b)) test_matches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - + a = random_rna_kmer(len) b = random_rna_kmer(len) test_matches(RNAKmer(a), RNAKmer(b)) test_matches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - + a = AAKmer(random_aa(len)) b = AAKmer(random_aa(len)) test_matches(a, b) end -end \ No newline at end of file +end diff --git a/test/order.jl b/test/order.jl index 8b08201..64dc243 100644 --- a/test/order.jl +++ b/test/order.jl @@ -1,7 +1,15 @@ @testset "Order" begin @test DNAMer("AA") < DNAMer("AC") < DNAMer("AG") < DNAMer("AT") < DNAMer("CA") @test RNAMer("AA") < RNAMer("AC") < RNAMer("AG") < RNAMer("AU") < RNAMer("CA") - - @test BigDNAMer("AA") < BigDNAMer("AC") < BigDNAMer("AG") < BigDNAMer("AT") < BigDNAMer("CA") - @test BigRNAMer("AA") < BigRNAMer("AC") < BigRNAMer("AG") < BigRNAMer("AU") < BigRNAMer("CA") + + @test BigDNAMer("AA") < + BigDNAMer("AC") < + BigDNAMer("AG") < + BigDNAMer("AT") < + BigDNAMer("CA") + @test BigRNAMer("AA") < + BigRNAMer("AC") < + BigRNAMer("AG") < + BigRNAMer("AU") < + BigRNAMer("CA") end diff --git a/test/print.jl b/test/print.jl index 9e8f486..b96a80c 100644 --- a/test/print.jl +++ b/test/print.jl @@ -6,13 +6,13 @@ print(buf, RNAKmer("ACGU")) @test String(take!(buf)) == "ACGU" - + print(buf, Kmer{DNAAlphabet{4}}("ACGT")) @test String(take!(buf)) == "ACGT" print(buf, Kmer{RNAAlphabet{4}}("ACGU")) 
@test String(take!(buf)) == "ACGU" - + print(buf, AAKmer("AMVKFPSMT")) @test String(take!(buf)) == "AMVKFPSMT" end @@ -22,16 +22,16 @@ end show(buf, DNAKmer("AGAGT")) @test String(take!(buf)) == "AGAGT" - + show(buf, RNAKmer("AGAGU")) @test String(take!(buf)) == "AGAGU" - + show(buf, Kmer{DNAAlphabet{4}}("AGAGT")) @test String(take!(buf)) == "AGAGT" - + show(buf, Kmer{RNAAlphabet{4}}("AGAGU")) @test String(take!(buf)) == "AGAGU" - + print(buf, AAKmer("AMVKFPSMT")) @test String(take!(buf)) == "AMVKFPSMT" end diff --git a/test/runtests.jl b/test/runtests.jl index 4260278..0dc2c86 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,7 @@ module TestKmers using Kmers -using BioSequences +using BioSequences using Test const GROUP = get(ENV, "GROUP", "All") diff --git a/test/transformations.jl b/test/transformations.jl index 1691688..e16c310 100644 --- a/test/transformations.jl +++ b/test/transformations.jl @@ -25,13 +25,13 @@ end @testset "Reverse" begin - for len in 1:64, _ in 1:10 + for len in 1:64, _ in 1:10 test_reverse(DNAKmer{len}, random_dna_kmer(len)) test_reverse(RNAKmer{len}, random_rna_kmer(len)) end seq = dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAGATAC" - @test reverse(seq[(length(seq)-9):length(seq)]) == dna"CATAGAAAAA" + @test reverse(seq[(length(seq) - 9):length(seq)]) == dna"CATAGAAAAA" end @testset "Complement" begin @@ -47,14 +47,14 @@ test_rna_revcomp(RNAKmer{len}, random_rna_kmer(len)) end end - + @testset "Canonical" begin - @test canonical(DNAKmer{4,1}("ACCG")) == DNAKmer{4,1}("ACCG") - @test canonical(DNAKmer{4,1}("GCAC")) == DNAKmer{4,1}("GCAC") - @test canonical(RNAKmer{4,1}("AAUU")) == RNAKmer{4,1}("AAUU") - @test canonical(RNAKmer{4,1}("UGGA")) == RNAKmer{4,1}("UCCA") - @test canonical(RNAKmer{4,1}("CGAU")) == RNAKmer{4,1}("AUCG") - @test canonical(RNAKmer{4,1}("UGGA")) == RNAKmer{4,1}("UCCA") - @test canonical(DNAKmer{4,1}("GCAC")) == DNAKmer{4,1}("GCAC") + @test canonical(DNAKmer{4, 1}("ACCG")) == DNAKmer{4, 1}("ACCG") + @test 
canonical(DNAKmer{4, 1}("GCAC")) == DNAKmer{4, 1}("GCAC") + @test canonical(RNAKmer{4, 1}("AAUU")) == RNAKmer{4, 1}("AAUU") + @test canonical(RNAKmer{4, 1}("UGGA")) == RNAKmer{4, 1}("UCCA") + @test canonical(RNAKmer{4, 1}("CGAU")) == RNAKmer{4, 1}("AUCG") + @test canonical(RNAKmer{4, 1}("UGGA")) == RNAKmer{4, 1}("UCCA") + @test canonical(DNAKmer{4, 1}("GCAC")) == DNAKmer{4, 1}("GCAC") end end diff --git a/test/translation.jl b/test/translation.jl index ccf34dc..38d8cdb 100644 --- a/test/translation.jl +++ b/test/translation.jl @@ -1,141 +1,138 @@ @testset "Translation" begin - -sampler = BioSequences.SamplerWeighted( - dna"ACGTMRSVWYHKDBN", - vcat(fill(0.225, 4), fill(0.00909, 10)) -) - -for A in (RNAAlphabet, DNAAlphabet) - for N in (2, 4) - for len in [3, 15, 33, 66] - for alternative in (true, false) - seq = if N == 2 - randseq(A{2}(), len) - else - randseq(A{4}(), sampler, len) + sampler = BioSequences.SamplerWeighted( + dna"ACGTMRSVWYHKDBN", + vcat(fill(0.225, 4), fill(0.00909, 10)), + ) + + for A in (RNAAlphabet, DNAAlphabet) + for N in (2, 4) + for len in [3, 15, 33, 66] + for alternative in (true, false) + seq = if N == 2 + randseq(A{2}(), len) + else + randseq(A{4}(), sampler, len) + end + kmer = Kmer{A{N}}(seq) + @test ( + translate(seq; alternative_start=alternative) == + translate(kmer; alternative_start=alternative) + ) end - kmer = Kmer{A{N}}(seq) - @test ( - translate(seq, alternative_start=alternative) == - translate(kmer, alternative_start=alternative) - ) end end end -end -# Throws when ambiguous -@test_throws Exception translate( - Kmer{RNAAlphabet{4}}("AUGCCGCMA"), - allow_ambiguous_codons=false -) + # Throws when ambiguous + @test_throws Exception translate( + Kmer{RNAAlphabet{4}}("AUGCCGCMA"), + allow_ambiguous_codons=false, + ) + + # Not divisible by 3 + @test_throws Exception translate(mer"UG"r) + @test_throws Exception translate(mer"TAGCTTAA"d) + @test_throws Exception translate(mer"CUGUAGUUGUCGC"r) + @test_throws Exception 
translate(mer"AGCGA"d) + + # Cannot transla AA seq + @test_throws MethodError translate(mer"LLVM"aa) + @test_throws MethodError translate(mer"ATG"aa) +end # translation + +@testset "CodonSet" begin + CodonSet = Kmers.CodonSet -# Not divisible by 3 -@test_throws Exception translate(mer"UG"r) -@test_throws Exception translate(mer"TAGCTTAA"d) -@test_throws Exception translate(mer"CUGUAGUUGUCGC"r) -@test_throws Exception translate(mer"AGCGA"d) + SAMPLE_SOURCES = Any[ + [mer"UAG"r, mer"ACC"r, mer"ACC"r, mer"UGG"r], + RNACodon[], + [mer"AAA"r, mer"ACC"r, mer"AAA"r, mer"UCA"r, mer"UCC"r], + (i for i in (mer"AGC"r, mer"AGA"r, mer"UUU"r)), + (mer"AAC"r, mer"AGG"r), + (mer"UUG"r,), + ] -# Cannot transla AA seq -@test_throws MethodError translate(mer"LLVM"aa) -@test_throws MethodError translate(mer"ATG"aa) + @testset "Construction and basics" begin + @test isempty(CodonSet()) -end # translation + # Constuct the sets and basic properties + for codons in SAMPLE_SOURCES + set = Set(codons) + codonset = CodonSet(codons) + @test issetequal(set, codonset) + @test length(codonset) == length(set) + end -@testset "CodonSet" begin -CodonSet = Kmers.CodonSet - -SAMPLE_SOURCES = Any[ - [mer"UAG"r, mer"ACC"r, mer"ACC"r, mer"UGG"r], - RNACodon[], - [mer"AAA"r, mer"ACC"r, mer"AAA"r, mer"UCA"r, mer"UCC"r], - (i for i in (mer"AGC"r, mer"AGA"r, mer"UUU"r)), - (mer"AAC"r, mer"AGG"r), - (mer"UUG"r,), -] - -@testset "Construction and basics" begin - @test isempty(CodonSet()) - - # Constuct the sets and basic properties - for codons in SAMPLE_SOURCES - set = Set(codons) - codonset = CodonSet(codons) - @test issetequal(set, codonset) - @test length(codonset) == length(set) + # Fails with non-codons + @test_throws MethodError CodonSet([(RNA_A, RNA_G)]) + @test_throws MethodError CodonSet((mer"UA"r,)) + @test_throws MethodError CodonSet([rna"AGG", rna"GGG"]) + @test_throws MethodError CodonSet([1, 2, 3]) end - # Fails with non-codons - @test_throws MethodError CodonSet([(RNA_A, RNA_G)]) - 
@test_throws MethodError CodonSet((mer"UA"r,)) - @test_throws MethodError CodonSet([rna"AGG", rna"GGG"]) - @test_throws MethodError CodonSet([1,2,3]) -end + SAMPLE_CODONSETS = map(CodonSet, SAMPLE_SOURCES) -SAMPLE_CODONSETS = map(CodonSet, SAMPLE_SOURCES) + @testset "Iteration" begin + for things in SAMPLE_SOURCES + @test sort!(collect(CodonSet(things))) == sort!(collect(Set(things))) + end -@testset "Iteration" begin - for things in SAMPLE_SOURCES - @test sort!(collect(CodonSet(things))) == sort!(collect(Set(things))) + @test iterate(CodonSet()) === nothing + codonset = CodonSet((mer"UUU"r,)) + codon, state = iterate(codonset) + @test codon == mer"UUU"r + @test iterate(codonset, state) === nothing end - @test iterate(CodonSet()) === nothing - codonset = CodonSet((mer"UUU"r,)) - codon, state = iterate(codonset) - @test codon == mer"UUU"r - @test iterate(codonset, state) === nothing -end - -@testset "Membership" begin - codonset = CodonSet([mer"ACC"r, mer"UAG"r, mer"UUU"r]) - @test mer"ACC"r in codonset - @test mer"UAG"r in codonset - @test mer"UUU"r in codonset - @test !in(mer"GAA"r, codonset) - @test !in(mer"AAA"r, codonset) -end - -@testset "Modifying" begin - # Push - s1 = CodonSet([mer"GGA"r, mer"UGU"r]) - s2 = push(s1, mer"GGA"r) - @test s1 == s2 - s3 = push(s2, mer"GAG"r) - @test Set(s3) == Set([mer"GGA"r, mer"UGU"r, mer"GGA"r, mer"GAG"r]) - - # Delete - s4 = delete(s3, mer"GAG"r) - @test s2 == s4 - s5 = delete(s4, mer"UGU"r) - @test only(s5) == mer"GGA"r - s6 = delete(s5, mer"UUU"r) - @test s5 == s6 - s7 = delete(s6, mer"GGA"r) - @test isempty(s7) -end - -@testset "Set operations" begin - for c1 in SAMPLE_CODONSETS, c2 in SAMPLE_CODONSETS - s1, s2 = Set(c1), Set(c2) - for operation in [union, intersect, setdiff, symdiff] - @test Set(operation(c1, c2)) == operation(s1, s2) - end - @test issubset(c1, c2) == issubset(s1, s2) + @testset "Membership" begin + codonset = CodonSet([mer"ACC"r, mer"UAG"r, mer"UUU"r]) + @test mer"ACC"r in codonset + @test mer"UAG"r in 
codonset + @test mer"UUU"r in codonset + @test !in(mer"GAA"r, codonset) + @test !in(mer"AAA"r, codonset) end -end - -@testset "Filter" begin - predicates = [ - (i -> i[2] == RNA_G), - (i -> isodd(length(i))), # always true for codons - (i -> i[1] == i[3]), - (i -> i[2] != RNA_A) - ] - for codonset in SAMPLE_CODONSETS, predicate in predicates - @test Set(filter(predicate, codonset)) == filter(predicate, Set(codonset)) + + @testset "Modifying" begin + # Push + s1 = CodonSet([mer"GGA"r, mer"UGU"r]) + s2 = push(s1, mer"GGA"r) + @test s1 == s2 + s3 = push(s2, mer"GAG"r) + @test Set(s3) == Set([mer"GGA"r, mer"UGU"r, mer"GGA"r, mer"GAG"r]) + + # Delete + s4 = delete(s3, mer"GAG"r) + @test s2 == s4 + s5 = delete(s4, mer"UGU"r) + @test only(s5) == mer"GGA"r + s6 = delete(s5, mer"UUU"r) + @test s5 == s6 + s7 = delete(s6, mer"GGA"r) + @test isempty(s7) end -end + @testset "Set operations" begin + for c1 in SAMPLE_CODONSETS, c2 in SAMPLE_CODONSETS + s1, s2 = Set(c1), Set(c2) + for operation in [union, intersect, setdiff, symdiff] + @test Set(operation(c1, c2)) == operation(s1, s2) + end + @test issubset(c1, c2) == issubset(s1, s2) + end + end + + @testset "Filter" begin + predicates = [ + (i -> i[2] == RNA_G), + (i -> isodd(length(i))), # always true for codons + (i -> i[1] == i[3]), + (i -> i[2] != RNA_A), + ] + for codonset in SAMPLE_CODONSETS, predicate in predicates + @test Set(filter(predicate, codonset)) == filter(predicate, Set(codonset)) + end + end end # CodonSet @testset "Reverse translation" begin @@ -143,7 +140,7 @@ end # CodonSet code2 = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code) for (rvcode, fwcode) in [ (Kmers.rev_standard_genetic_code, BioSequences.standard_genetic_code), - (code2, BioSequences.trematode_mitochondrial_genetic_code) + (code2, BioSequences.trematode_mitochondrial_genetic_code), ] @test reverse_translate(aa"", rvcode) == CodonSet[] observed = Dict{AminoAcid, CodonSet}() @@ -155,7 +152,7 @@ end # CodonSet # Length and 
iteration of ReverseGeneticCode @test length(rvcode) == length(symbols(AminoAcidAlphabet())) - 1 # all but AA_Gap - @test sort!(collect(rvcode), by=first) == sort!(collect(observed), by=first) + @test sort!(collect(rvcode); by=first) == sort!(collect(observed); by=first) flipped = Dict(v => k for (k, v) in observed) for (codonset, aa) in flipped @@ -181,13 +178,34 @@ end # CodonSet (AA_J, [AA_I, AA_L]), (AA_Z, [AA_E, AA_Q]), (AA_B, [AA_D, AA_N]), - (AA_X, [ - AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, - AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V - ]) - ] + ( + AA_X, + [ + AA_A, + AA_R, + AA_N, + AA_D, + AA_C, + AA_Q, + AA_E, + AA_G, + AA_H, + AA_I, + AA_L, + AA_K, + AA_M, + AA_F, + AA_P, + AA_S, + AA_T, + AA_W, + AA_Y, + AA_V, + ], + ), + ] c1 = only(reverse_translate(LongAA([ambig]), rvcode)) - c2 = foldl(elements, init=CodonSet()) do old, aa + c2 = foldl(elements; init=CodonSet()) do old, aa union(old, reverse_translate(aa, rvcode)) end @test c1 == c2 diff --git a/test/utils.jl b/test/utils.jl index a2d74b3..9dffd1b 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,5 +1,5 @@ # Return a random DNA/RNA sequence of the given length. -function random_seq(n::Integer, nts, probs, outtype = String) +function random_seq(n::Integer, nts, probs, outtype=String) cumprobs = cumsum(probs) x = Vector{Char}(undef, n) for i in 1:n @@ -8,7 +8,7 @@ function random_seq(n::Integer, nts, probs, outtype = String) return outtype(x) end -function random_seq(::Type{A}, n::Integer) where {A<:Alphabet} +function random_seq(::Type{A}, n::Integer) where {A <: Alphabet} # TODO: Resolve the use of symbols(A()). 
nts = symbols(A()) probs = Vector{Float64}(undef, length(nts)) @@ -25,10 +25,33 @@ function random_rna(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) end function random_aa(len) - return random_seq(len, - ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X' ], - push!(fill(0.049, 20), 0.02)) + return random_seq( + len, + [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'Q', + 'E', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V', + 'X', + ], + push!(fill(0.049, 20), 0.02), + ) end function random_dna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) @@ -44,8 +67,34 @@ function random_rna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) end function random_aa_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) - return random_seq(n, ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X' ], probs, Vector{AminoAcid}) + return random_seq( + n, + [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'Q', + 'E', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V', + 'X', + ], + probs, + Vector{AminoAcid}, + ) end function random_dna_kmer(len) @@ -72,4 +121,4 @@ function rna_complement(seq::AbstractString) seqc[i] = complementer[c] end return String(seqc) -end \ No newline at end of file +end From 4abc5e9b2967269c6a16118807e3c43aded00a29 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 25 Jun 2023 20:41:39 +0200 Subject: [PATCH 02/33] Random cleanup --- .gitignore | 3 +- src/Kmers.jl | 21 +++---- src/kmer.jl | 133 +++++++++++++++++---------------------- src/tuple_bitflipping.jl | 24 +++---- 4 files changed, 78 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 02e1207..8cb4c0b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ *.jl.*.cov *.jl.mem .DS_Store -Manifest.toml \ No newline at end of file +Manifest.toml +TODO.md diff --git a/src/Kmers.jl b/src/Kmers.jl index f08aa19..a9b76f5 
100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -6,8 +6,6 @@ # This file is a part of the Kmers.jl, a package in the BioJulia ecosystem. # License is MIT: https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE -__precompile__() - module Kmers export @@ -128,22 +126,23 @@ export ### ### Sequence literals ### - @mer_str, @bigmer_str using BioSequences -ispermitted(::DNAAlphabet{2}, nt::DNA) = count_ones(nt) == 1 && isvalid(nt) -ispermitted(::DNAAlphabet{2}, data::UInt) = data < UInt(4) -ispermitted(::DNAAlphabet{4}, nt::DNA) = isvalid(nt) -ispermitted(::DNAAlphabet{4}, data::UInt) = isvalid(DNA, data) -ispermitted(::AminoAcidAlphabet, aa::AminoAcid) = - reinterpret(UInt8, aa) <= reinterpret(UInt8, AA_Gap) -ispermitted(::AminoAcidAlphabet, data::UInt) = data <= 0x1b +""" + Kmers.Unsafe -include("kmer.jl") +Trait object used to access unsafe methods of functions. +`unsafe` is the singleton of `Unsafe`. +""" +struct Unsafe end +const unsafe = Unsafe() +include("tuple_bitflipping.jl") +include("kmer.jl") +include("revtrans.jl") include("kmer_iteration/AbstractKmerIterator.jl") include("kmer_iteration/EveryKmer.jl") include("kmer_iteration/SpacedKmers.jl") diff --git a/src/kmer.jl b/src/kmer.jl index b956783..5b26c58 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -1,75 +1,87 @@ -### -### Kmer Type definition -### - -# Include some basic tuple bitflipping ops - the secret sauce to efficiently -# manipping Kmer's static data. -include("tuple_bitflipping.jl") - -""" - Kmers.Unsafe - -Trait object used to access unsafe methods of functions. -`unsafe` is the singleton of `Unsafe`. -""" -struct Unsafe end -const unsafe = Unsafe() +# Notes about Kmers's representation: +# Each element is encoded in the same way as a LongSequence, however the order +# is different. In a Kmer, the elements fill from MSB to LSB, from first to +# last tuple index. Unused bits are always zeroed. 
+# This layout complicates some Kmer construction code, but simplifies comparison +# operators, and we really want Kmers to be efficient. """ Kmer{A<:Alphabet,K,N} <: BioSequence{A} -A parametric, immutable, bitstype for representing Kmers - short sequences. -Given the number of Kmers generated from raw sequencing reads, avoiding -repetetive memory allocation and triggering of garbage collection is important, -as is the ability to effectively pack Kmers into arrays and similar collections. - -In practice that means we an immutable bitstype as the internal representation -of these sequences. Thankfully, this is not much of a limitation - kmers are -rarely manipulated and so by and large don't have to be mutable. +A parametric, immutable, bitstype for representing k-mers - short sequences +of a fixed length K. -Excepting their immutability, they fulfill the rest of the API and behaviours -expected from a concrete `BioSequence` type, and non-mutating transformations -of the type are still defined. - -!!! warning - Given their immutability, `setindex` and mutating sequence transformations - are not implemented for Kmers e.g. `reverse_complement!`. -!!! tip - Note that some sequence transformations that are not mutating are - available, since they can return a new kmer value as a result e.g. - `reverse_complement`. +Since they can be stored directly in registers, `Kmer`s are generally the most +efficient type of `BioSequence`, when `K` is small and known at compile time. 
""" struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} - data::NTuple{N, UInt64} + data::NTuple{N, UInt} # This unsafe method do not clip the head - Kmer{A, K, N}(::Unsafe, data::NTuple{N, UInt64}) where {A <: Alphabet, K, N} = + function Kmer{A, K, N}(::Unsafe, data::NTuple{N, UInt}) where {A <: Alphabet, K, N} new{A, K, N}(data) + end - function Kmer{A, K, N}(data::NTuple{N, UInt64}) where {A <: Alphabet, K, N} + function Kmer{A, K, N}(data::NTuple{N, UInt}) where {A <: Alphabet, K, N} checkmer(Kmer{A, K, N}) x = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) return new(_cliphead(x, data...)) end end +""" + checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} + +Internal method - enforces good kmer type parameterisation. + +For a given Kmer{A,K,N} of length K, the number of words used to +represent it (N) should be the minimum needed to contain all K symbols. + +This function should compile to a noop in case the parameterization is good. +""" +@inline function checkmer(::Type{Kmer{A, K, N}}) where {A, K, N} + if !(K isa Int) + throw(ArgumentError("K must be an Int")) + elseif K < 1 + throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) + end + n = cld((K * BioSequences.bits_per_symbol(A())) % UInt, (sizeof(UInt) * 8) % UInt) % Int + if !(N isa Int) + throw(ArgumentError("N must be an Int")) + elseif n !== N + # This has been significantly changed conceptually from before. Now we + # don't just check K, but *enforce* the most appropriate N for K. + throw(ArgumentError("Bad kmer parameterisation. 
For K = $K, N should be $n")) + end +end + +function Kmer{A, K, N}(itr) where {A, K, N} + Kmer{A, K, N}(Base.IteratorSize(itr), itr) +end + +function Kmer{A, K, N}(::Base.SizeUnknown, itr) where {A, K, N} + Kmer{A, K, N}(collect(itr)) +end + +function Kmer{A, K, N}(::Union{Base.HasShape, Base.HasLength}, itr) where {A, K, N} +end + BioSequences.encoded_data(seq::Kmer{A, K, N}) where {A, K, N} = seq.data # Create a blank ntuple of appropriate length for a given Kmer with N. -@inline blank_ntuple(::Type{Kmer{A, K, N}}) where {A, K, N} = +@inline function blank_ntuple(::Type{Kmer{A, K, N}}) where {A, K, N} ntuple(x -> zero(UInt64), Val{N}()) +end ### ### _build_kmer_data ### -#= -These are (hopefully!) very optimised kernel functions for building kmer internal -data from individual elements or from sequences. Kmers themselves are static, -tuple-based structs, and so I really didn't want these functions to create memory -allocations or GC activity through use of vectors an such, for what should be -the creation of a single, rather simple value. -=# +# These are (hopefully!) very optimised kernel functions for building kmer internal +# data from individual elements or from sequences. Kmers themselves are static, +# tuple-based structs, and so I really didn't want these functions to create memory +# allocations or GC activity through use of vectors an such, for what should be +# the creation of a single, rather simple value. 
""" _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} @@ -400,10 +412,6 @@ const DNACodon = DNAKmer{3, 1} "Shorthand for `RNAKmer{3,1}`" const RNACodon = RNAKmer{3, 1} -### -### Base Functions -### - @inline ksize(::Type{Kmer{A, K, N}}) where {A, K, N} = K @inline nsize(::Type{Kmer{A, K, N}}) where {A, K, N} = N @inline per_word_capacity(::Type{Kmer{A, K, N}}) where {A, K, N} = @@ -418,30 +426,7 @@ const RNACodon = RNAKmer{3, 1} per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) @inline elements_in_head(seq::Kmer) = elements_in_head(typeof(seq)) -""" - checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} - -Internal method - enforces good kmer type parameterisation. - -For a given Kmer{A,K,N} of length K, the number of words used to -represent it (N) should be the minimum needed to contain all K symbols, -no larger (wasteful) no smaller (just... wrong). -Because it is used on type parameters / variables, these conditions should be -checked at compile time, and the branches / error throws eliminated when the -parameterisation of the Kmer type is good. -""" -@inline function checkmer(::Type{Kmer{A, K, N}}) where {A, K, N} - if K < 1 - throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) - end - n = BioSequences.seq_data_len(A, K) - if n !== N - # This has been significantly changed conceptually from before. Now we - # don't just check K, but *enforce* the most appropriate N for K. - throw(ArgumentError("Bad kmer parameterisation. 
For K = $K, N should be $n")) - end -end @inline Base.length(x::Kmer{A, K, N}) where {A, K, N} = K @inline Base.summary(x::Kmer{A, K, N}) where {A, K, N} = string(eltype(x), ' ', K, "-mer") @@ -597,4 +582,4 @@ macro mer_str(seq) return T(seq′) end -include("revtrans.jl") + diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index c31c338..3bbe025 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -26,15 +26,13 @@ Notably it's used when constructing a Kmer from an existing NTuple of UInt64 return (head & (typemax(UInt64) >> by), tail...) end -#= -rightshift_carry & leftshift_carry +# rightshift_carry & leftshift_carry -These methods are micro-optimised (or should be!!!) for shifting the bits in -an NTuple of unsigned integers, carrying the bits "shifted off" one word -over to the next word. The carry can also be "seeded" so as other methods like -pushfirst and pushlast can be efficiently implemented without duplication of code -or less efficient implementations that first shift and then insert an element. -=# +# These methods are micro-optimised (or should be!!!) for shifting the bits in +# an NTuple of unsigned integers, carrying the bits "shifted off" one word +# over to the next word. The carry can also be "seeded" so as other methods like +# pushfirst and pushlast can be efficiently implemented without duplication of code +# or less efficient implementations that first shift and then insert an element. @inline function rightshift_carry( x::NTuple{N, UInt64}, @@ -81,12 +79,4 @@ end return (_reverse(bpe, tail...)..., BioSequences.reversebits(head, bpe)) end -@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () - -#= -@inline function _reverse(f::F, bpe::BioSequences.BitsPerSymbol{N}, head::UInt64, tail...) 
where {N,F<:Function} - return (_reverse(f, bpe, tail...)..., f(reversebits(head, bpe))) -end - -@inline _reverse(f::F, ::BioSequences.BitsPerSymbol{N}) where {N,F<:Function} = () -=# +@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () \ No newline at end of file From 45a9f38840a4bdca1ae08d27ab60645ee2100f80 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Thu, 20 Jul 2023 15:54:47 +0200 Subject: [PATCH 03/33] More fixups - squash --- src/Kmers.jl | 3 + src/indexing.jl | 44 +-- src/kmer.jl | 633 ++++++++++----------------------------- src/tuple_bitflipping.jl | 62 +++- 4 files changed, 222 insertions(+), 520 deletions(-) diff --git a/src/Kmers.jl b/src/Kmers.jl index a9b76f5..9592284 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -142,11 +142,14 @@ const unsafe = Unsafe() include("tuple_bitflipping.jl") include("kmer.jl") +include("indexing.jl") +#= include("revtrans.jl") include("kmer_iteration/AbstractKmerIterator.jl") include("kmer_iteration/EveryKmer.jl") include("kmer_iteration/SpacedKmers.jl") include("kmer_iteration/EveryCanonicalKmer.jl") include("kmer_iteration/SpacedCanonicalKmers.jl") +=# end # module diff --git a/src/indexing.jl b/src/indexing.jl index 1221466..a7f99e2 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -1,38 +1,10 @@ -@inline BioSequences.encoded_data_eltype(::Type{<:Kmer}) = UInt64 - -@inline function BioSequences.extract_encoded_element(seq::Kmer, i::Integer) - bi = BioSequences.bitindex(seq, i % UInt) - return BioSequences.extract_encoded_element(bi, seq.data) +@inline function BioSequences.extract_encoded_element(seq::Kmer{A}, i::Integer) where A + T = typeof(seq) + bps = BioSequences.bits_per_symbol(A()) % UInt + index = div((i + n_unused(T) - 1) % UInt, per_word_capacity(T) % UInt) + 1 + offset = mod(((elements_in_head(T) - i) * bps) % UInt, 8 * sizeof(UInt)) + mask = UInt(1) << bps - 1 + right_shift(@inbounds(seq.data[index]), offset) & mask end -@inline Base.copy(seq::Kmer) = typeof(seq)(seq.data) - 
-@inline encoded_data(x::Kmer) = x.data - -@inline BioSequences.bitindex(seq::Kmer, i::Integer) = BioSequences.bitindex( - BioSequences.BitsPerSymbol(seq), - BioSequences.encoded_data_eltype(typeof(seq)), - i + n_unused(seq), -) - -""" -Base.getindex(seq::Kmer, i::UnitRange) - -Slice a Kmer by a UnitRange. - -!!! warning - Using this function will introduce performance penalties in your code if - you pass values of `i` that are not constants that can be propagated. -""" -@inline function Base.getindex(seq::Kmer{A}, i::UnitRange) where {A} - @boundscheck Base.checkbounds(seq, i) - ind(s, i) = BioSequences.index(BioSequences.bitindex(s, i)) - off(s, i) = BioSequences.offset(BioSequences.bitindex(s, i)) - isempty(i) && return Kmer{A, 0, 0}(()) - rshift = (64 - off(seq, last(i) + 1)) & 63 - stop = ind(seq, last(i)) - start = BioSequences.index(BioSequences.bitindex(seq, first(i)) + rshift) - data = Kmers.rightshift_carry(seq.data, rshift) - T = Kmers.kmertype(Kmer{A, length(i)}) - return T(data[start:stop]) -end +# TODO: Index with range, index with bitvector \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index 5b26c58..d3b3923 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -15,22 +15,46 @@ Since they can be stored directly in registers, `Kmer`s are generally the most efficient type of `BioSequence`, when `K` is small and known at compile time. """ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} + # The number of UInt is always exactly the number needed, no less, no more. + # The first symbols pack into the first UInts + # An UInt with N elements pack into the lowest bits of the UInt, with the + # first symbols in the higher parts of the UInt. 
+ # Hence, a sequence A-G of 16-bit elements would pack like: + # ( ABC, DEFG) + # ^ 16 unused bits, the unused bits are always top bits of first UInt data::NTuple{N, UInt} # This unsafe method do not clip the head function Kmer{A, K, N}(::Unsafe, data::NTuple{N, UInt}) where {A <: Alphabet, K, N} + check_kmer(Kmer{A, K, N}) new{A, K, N}(data) end function Kmer{A, K, N}(data::NTuple{N, UInt}) where {A <: Alphabet, K, N} - checkmer(Kmer{A, K, N}) + check_kmer(Kmer{A, K, N}) x = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) - return new(_cliphead(x, data...)) + return new(cliphead(x, data...)) end end +# Aliases +"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" +const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} + +"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" +const RNAKmer{K, N} = Kmer{RNAAlphabet{2}, K, N} + +"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" +const AAKmer{K, N} = Kmer{AminoAcidAlphabet, K, N} + +"Shorthand for `DNAKmer{3,1}`" +const DNACodon = DNAKmer{3, 1} + +"Shorthand for `RNAKmer{3,1}`" +const RNACodon = RNAKmer{3, 1} + """ - checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} + check_kmer(::Type{Kmer{A,K,N}}) where {A,K,N} Internal method - enforces good kmer type parameterisation. @@ -39,7 +63,7 @@ represent it (N) should be the minimum needed to contain all K symbols. This function should compile to a noop in case the parameterization is good. """ -@inline function checkmer(::Type{Kmer{A, K, N}}) where {A, K, N} +@inline function check_kmer(::Type{Kmer{A, K, N}}) where {A, K, N} if !(K isa Int) throw(ArgumentError("K must be an Int")) elseif K < 1 @@ -55,531 +79,174 @@ This function should compile to a noop in case the parameterization is good. 
end end -function Kmer{A, K, N}(itr) where {A, K, N} - Kmer{A, K, N}(Base.IteratorSize(itr), itr) -end +################################################ +# Compile-time functions computed on Kmer types +################################################ -function Kmer{A, K, N}(::Base.SizeUnknown, itr) where {A, K, N} - Kmer{A, K, N}(collect(itr)) -end +@inline ksize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = K +@inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N +@inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K -function Kmer{A, K, N}(::Union{Base.HasShape, Base.HasLength}, itr) where {A, K, N} +@inline function n_coding_elements(::Type{<:Kmer{A, K}}) where {A, K} + cld(BioSequences.bits_per_symbol(A()) * K, 8 * sizeof(UInt)) end -BioSequences.encoded_data(seq::Kmer{A, K, N}) where {A, K, N} = seq.data - -# Create a blank ntuple of appropriate length for a given Kmer with N. -@inline function blank_ntuple(::Type{Kmer{A, K, N}}) where {A, K, N} - ntuple(x -> zero(UInt64), Val{N}()) +@inline function per_word_capacity(::Type{<:Kmer{A}}) where A + div(8 * sizeof(UInt), BioSequences.bits_per_symbol(A())) end -### -### _build_kmer_data -### - -# These are (hopefully!) very optimised kernel functions for building kmer internal -# data from individual elements or from sequences. Kmers themselves are static, -# tuple-based structs, and so I really didn't want these functions to create memory -# allocations or GC activity through use of vectors an such, for what should be -# the creation of a single, rather simple value. - -""" - _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - -Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}. - -This particular method is specialised for LongSequences, and for when the Kmer -and LongSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. 
-""" -@inline function _build_kmer_data( - ::Type{Kmer{A, K, N}}, - seq::LongSequence{A}, - from::Int=1, -) where {A, K, N} - checkmer(Kmer{A, K, N}) - - bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - n_head = elements_in_head(Kmer{A, K, N}) # Based on kmer type, should constant fold. - n_per_chunk = per_word_capacity(Kmer{A, K, N}) # Based on kmer type, should constant fold. - - if from + K - 1 > length(seq) - return nothing - end - - # Construct the head. - head = zero(UInt64) - @inbounds for i in from:(from + n_head - 1) - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end - - # And the rest of the sequence - idx = Ref(from + n_head) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body - end - - # Put head and tail together - return (head, tail...) +@inline function capacity(::Type{<:Kmer{A, K, N}}) where {A, K, N} + per_word_capacity(Kmer{A, K, N}) * N end -### -### Constructors -### - -""" - Kmer{A,K,N}(itr) where {A,K,N} - -Construct a `Kmer{A,K,N}` from an iterable. - -The most generic constructor. - -Currently the iterable must have `length` & support `getindex` with integers. - -# Examples - -```jldoctest -julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet -5nt DNA Sequence: -TTAGC - -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -function Kmer{A, K, N}(itr) where {A, K, N} - checkmer(Kmer{A, K, N}) - - seqlen = length(itr) - if seqlen != K - throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. 
- bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A, K, N}) - n_per_chunk = per_word_capacity(Kmer{A, K, N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - (x, next_i) = iterate(itr, i) - sym = convert(eltype(Kmer{A, K, N}), x) - # Encode will throw if it cant encode an element. - head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) - end - - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for i in 1:n_per_chunk - (x, next_idx) = iterate(itr, idx[]) - sym = convert(eltype(Kmer{A, K, N}), x) - # Encode will throw if it cant encode an element. - body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) - idx[] += 1 - end - return body - end - - data = (head, tail...) - - return Kmer{A, K, N}(data) +@inline function elements_in_head(::Type{<:Kmer{A, K, N}}) where {A, K, N} + per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) end -""" - Kmer{A,K,N}(seq::BioSequence{A}) - -Construct a `Kmer{A,K,N}` from a `BioSequence{A}`. - -This particular method is specialised for BioSequences, and for when the Kmer -and BioSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. -In the case where the alphabet of the Kmer and the alphabet of the BioSequence -differ, dispatch to the more generic constructor occurs instead. 
- -# Examples - -```jldoctest -julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet -5nt DNA Sequence: -TTAGC +################################################ +# Constructors +################################################ -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -@inline function Kmer{A, K, N}(seq::BioSequence{A}) where {A, K, N} - checkmer(Kmer{A, K, N}) - - seqlen = length(seq) - if seqlen != K - throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A, K, N}) - n_per_chunk = per_word_capacity(Kmer{A, K, N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end +zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), nsize(T)) - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body +# Generic, unknown size +@inline function construct_generic(::Base.SizeUnknown, T::Type{<:Kmer{A, K}}, itr) where {A, K} + check_kmer(T) + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for (i, element) in enumerate(itr) + i > K && error("Length of sequence must be K elements to build Kmer") + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A(), symbol)) + data = leftshift_carry(data, nbits, carry) end - - data = (head, tail...) - - return Kmer{A, K, N}(data) + T(unsafe, data) end -# Convenience version of function above so you don't have to work out correct N. -""" - Kmer{A,K}(itr) where {A,K} - -Construct a `Kmer{A,K,N}` from an iterable. 
- -This is a convenience method which will work out the correct `N` parameter, for -your given choice of `A` & `K`. -""" -@inline function Kmer{A, K}(itr) where {A, K} - T = kmertype(Kmer{A, K}) - return T(itr) -end - -""" - Kmer{A}(itr) where {A} - -Construct a `Kmer{A,K,N}` from an iterable. - -This is a convenience method which will work out K from the length of `itr`, and -the correct `N` parameter, for your given choice of `A` & `K`. - -!!! warning - Since this gets K from runtime values, this is gonna be slow! -""" -@inline Kmer{A}(itr) where {A} = Kmer{A, length(itr)}(itr) -@inline Kmer(seq::BioSequence{A}) where {A} = Kmer{A}(seq) - -function Kmer{A1}( - seq::BioSequence{A2}, -) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} - kmertype(Kmer{A1, length(seq)})(seq) -end - -@inline function Kmer{A}( - nts::Vararg{Union{DNA, RNA}, K}, -) where {A <: NucleicAcidAlphabet, K} - return kmertype(Kmer{A, K})(nts) -end - -""" - Kmer(nts::Vararg{DNA,K}) where {K} - -Construct a Kmer from a variable number `K` of DNA nucleotides. - -# Examples - -```jldoctest -julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC -``` -""" -@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...)) - -""" - Kmer(nts::Vararg{RNA,K}) where {K} - -Construct a Kmer from a variable number `K` of RNA nucleotides. - -# Examples - -```jldoctest -julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C) -DNA 5-mer: -UUAGC -``` -""" -@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...)) - -""" - Kmer(seq::String) - -Construct a DNA or RNA kmer from a string. - -!!! warning - As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters - for the `Kmer{A,K,N}` type from the input string. 
- -# Examples - -```jldoctest -julia> Kmer("TTAGC") -DNA 5-mer: -TTAGC -``` -""" -@inline function Kmer(seq::String) - seq′ = BioSequences.remove_newlines(seq) - hast = false - hasu = false - for c in seq′ - hast |= ((c == 'T') | (c == 't')) - hasu |= ((c == 'U') | (c == 'u')) - end - if (hast & hasu) | (!hast & !hasu) - throw(ArgumentError("Can't detect alphabet type from string")) +# Generic, size known +@inline function construct_generic_unchecked(::Union{Base.HasLength, Base.HasShape}, T::Type{<:Kmer{A}}, itr) where A + check_kmer(T) + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for element in itr + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A(), symbol)) + data = leftshift_carry(data, nbits, carry) end - A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2}) - return Kmer{A, length(seq′)}(seq′) + T(unsafe, data) end -""" - kmertype(::Type{Kmer{A,K}}) where {A,K} -Resolve and incomplete kmer typing, computing the N parameter of -`Kmer{A,K,N}`, given only `Kmer{A,K}`. -## Example -```julia -julia> DNAKmer{63} -Kmer{DNAAlphabet{2},63,N} where N -julia> kmertype(DNAKmer{63}) -Kmer{DNAAlphabet{2},63,2} -``` -""" -@inline function kmertype(::Type{Kmer{A, K}}) where {A, K} - return Kmer{A, K, BioSequences.seq_data_len(A, K)} +# Generic, size known but length not checked. 
+@inline function construct_generic(iT::Union{Base.HasLength, Base.HasShape}, T::Type{<:Kmer{A, K}}, itr) where {A, K} + length(s) == K || error("Length of sequence must be K elements to build Kmer") + construct_generic_unchecked(iT, T, itr) end -@inline kmertype(::Type{Kmer{A, K, N}}) where {A, K, N} = Kmer{A, K, N} - -# Aliases -"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" -const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} - -"Shortcut for the type `DNAKmer{27,1}`" -const DNA27mer = DNAKmer{27, 1} - -"Shortcut for the type `DNAKmer{31,1}`" -const DNA31mer = DNAKmer{31, 1} - -"Shortcut for the type `DNAKmer{63,2}`" -const DNA63mer = DNAKmer{63, 2} - -"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" -const RNAKmer{K, N} = Kmer{RNAAlphabet{2}, K, N} - -"Shortcut for the type `RNAKmer{27,1}`" -const RNA27mer = RNAKmer{27, 1} - -"Shortcut for the type `RNAKmer{31,1}`" -const RNA31mer = RNAKmer{31, 1} -"Shortcut for the type `RNAKmer{63,2}`" -const RNA63mer = RNAKmer{63, 2} - -"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" -const AAKmer{K, N} = Kmer{AminoAcidAlphabet, K, N} - -"Shorthand for `DNAKmer{3,1}`" -const DNACodon = DNAKmer{3, 1} - -"Shorthand for `RNAKmer{3,1}`" -const RNACodon = RNAKmer{3, 1} - -@inline ksize(::Type{Kmer{A, K, N}}) where {A, K, N} = K -@inline nsize(::Type{Kmer{A, K, N}}) where {A, K, N} = N -@inline per_word_capacity(::Type{Kmer{A, K, N}}) where {A, K, N} = - div(64, BioSequences.bits_per_symbol(A())) -@inline per_word_capacity(seq::Kmer) = per_word_capacity(typeof(seq)) -@inline capacity(::Type{Kmer{A, K, N}}) where {A, K, N} = - per_word_capacity(Kmer{A, K, N}) * N -@inline capacity(seq::Kmer) = capacity(typeof(seq)) -@inline n_unused(::Type{Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K -@inline n_unused(seq::Kmer) = n_unused(typeof(seq)) -@inline elements_in_head(::Type{Kmer{A, K, N}}) where {A, K, N} = - per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) -@inline elements_in_head(seq::Kmer) = 
elements_in_head(typeof(seq)) - - - -@inline Base.length(x::Kmer{A, K, N}) where {A, K, N} = K -@inline Base.summary(x::Kmer{A, K, N}) where {A, K, N} = string(eltype(x), ' ', K, "-mer") - -function Base.typemin(::Type{Kmer{A, K, N}}) where {A, K, N} - return Kmer{A, K, N}(unsafe, ntuple(i -> zero(UInt64), N)) +# BioSequences with the same Alphabet and these element types do not need to decode +# and encode, but can copy the raw bits directly into the kmer +@inline function construct_unchecked( + T::Type{<:Kmer{A}}, s::BioSequence{A}, data_eltype::Type{E} +) where {A, E <: Union{UInt8, UInt16, UInt32, UInt}} + check_kmer(T) + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for i in 1:K + data = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) + end + T(unsafe, data) end -function Base.typemax(::Type{Kmer{A, K, N}}) where {A, K, N} - return Kmer{A, K, N}((typemax(UInt64), ntuple(i -> typemax(UInt64), N - 1)...)) +# BioSequence with another element type fall back to the generic length constructor +@inline function construct_unchecked(T::Type{<:Kmer}, s::BioSequence, data_eltype::Type) + construct_generic_unchecked(Base.HasLength(), T, s) end -@inline function rand_kmer_data(::Type{Kmer{A, K, N}}, ::Val{true}) where {A, K, N} - return Kmer{A, K, N}(ntuple(i -> rand(UInt64), Val{N}())) +# BioSequence must implement length so we don't need to dispatch on that. +# However, if the encoded data eltype is an unsigned, we can use a specialized method where we don't +# decode each symbol but simply move the encoded data directly into the tuple +function Kmer{A, K, N}(s::BioSequence) where {A, K, N} + length(s) == K || error("Length of sequence must be K elements to build Kmer") + construct_unchecked(T, s, BioSequences.encoded_data_eltype(typeof(s))) end -@inline function rand_kmer_data(::Type{Kmer{A, K, N}}, ::Val{false}) where {A, K, N} - ## All based on alphabet type of Kmer, so should constant fold. 
- bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A, K, N}) - n_per_chunk = per_word_capacity(Kmer{A, K, N}) - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - bits = UInt64(BioSequences.encode(A(), rand(symbols(A())))) - head = (head << bits_per_sym) | bits - end - # And the rest of the sequence - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.encode(A(), rand(symbols(A())))) - body = (body << bits_per_sym) | bits - end - return body - end - return (head, tail...) +# Generic constructor: Dispatch on the iteratorsize +function Kmer{A, K, N}(itr) where {A, K, N} + construct_generic(Base.IteratorSize(typeof(itr)), Kmer{A, K, N}, itr) end -""" - Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} - Base.rand(::Type{Kmer{A,K}}) where {A,K} - -Create a random kmer of a specified alphabet and length - -# Examples -```julia -julia> rand(Kmer{DNAAlphabet{2}, 3}) -BioSymbols.DNA 3-mer: -ACT - -``` -""" -@inline function Base.rand(::Type{Kmer{A, K, N}}) where {A, K, N} - checkmer(Kmer{A, K, N}) - return Kmer{A, K, N}(rand_kmer_data(Kmer{A, K, N}, BioSequences.iscomplete(A()))) +# To avoid having the O(N) length check. TODO: Use optimised method +function Kmer{A, K, N}(s::Union{String, SubString{String}}) where {A, K, N} + construct_generic(Base.SizeUnknown(), Kmer{A, K, N}, s) end -Base.rand(::Type{Kmer{A, K}}) where {A, K} = rand(kmertype(Kmer{A, K})) +################################################ +# Derived constructors +################################################ -function Base.rand(::Type{T}, size::Integer) where {T <: Kmer} - return [rand(T) for _ in 1:size] +# Where the parameters of the kmer is not specified in the constructor +function Kmer(s::BioSequence{A}) where A + K = length(s) + N = n_coding_elements(Kmer{A, K}) + Kmer{A, K, N}(s) end -### -### Old Mer Base Functions - not transferred to new type. 
-### -#@inline encoded_data_type(::Type{Mer{A,K}}) where {A,K} = UInt64 -#@inline encoded_data_type(::Type{BigMer{A,K}}) where {A,K} = UInt128 -#@inline encoded_data_type(x::AbstractMer) = encoded_data_type(typeof(x)) -#@inline encoded_data(x::AbstractMer) = reinterpret(encoded_data_type(typeof(x)), x) -#@inline ksize(::Type{T}) where {A,K,T<:AbstractMer{A,K}} = K -#@inline Base.unsigned(x::AbstractMer) = encoded_data(x) -#Base.:-(x::AbstractMer, y::Integer) = typeof(x)(encoded_data(x) - y % encoded_data_type(x)) -#Base.:+(x::AbstractMer, y::Integer) = typeof(x)(encoded_data(x) + y % encoded_data_type(x)) -#Base.:+(x::AbstractMer, y::AbstractMer) = y + x -#Alphabet(::Type{Mer{A,K} where A<:NucleicAcidAlphabet{2}}) where {K} = Any - -include("indexing.jl") - -#LongSequence{A}(x::Kmer{A,K,N}) where {A,K,N} = LongSequence{A}([nt for nt in x]) -# Convenience method so as don't need to specify A in LongSequence{A}. -BioSequences.LongSequence(x::Kmer{A, K, N}) where {A, K, N} = LongSequence{A}(x) - -include("predicates.jl") -include("counting.jl") -include("transformations.jl") - -### -### Kmer de-bruijn neighbors -### - -# TODO: Decide on this vs. old iterator pattern. I like the terseness of the code vs defining an iterator. Neither should allocate. 
-fw_neighbors(kmer::Kmer{A, K, N}) where {A <: DNAAlphabet, K, N} = - ntuple(i -> pushlast(kmer, ACGT[i]), Val{4}()) -fw_neighbors(kmer::Kmer{A, K, N}) where {A <: RNAAlphabet, K, N} = - ntuple(i -> pushlast(kmer, ACGU[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A, K, N}) where {A <: DNAAlphabet, K, N} = - ntuple(i -> pushfirst(kmer, ACGT[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A, K, N}) where {A <: RNAAlphabet, K, N} = - ntuple(i -> pushfirst(kmer, ACGU[i]), Val{4}()) - -#= -# Neighbors on a de Bruijn graph -struct KmerNeighborIterator{S<:Kmer} - x::S +# Where A, but not K is specified +function Kmer{A}(s::Union{String, SubString{String}}) where A + K = length(s) + N = n_coding_elements(Kmer{A, K}) + construct_generic_unchecked(Base.HasLength(), Kmer{A, K, N}, s) end -""" - neighbors(kmer::S) where {S<:Kmer} - -Return an iterator through skip-mers neighboring `skipmer` on a de Bruijn graph. -""" -neighbors(kmer::Kmer) = KmerNeighborIterator{typeof(kmer)}(kmer) +# TODO: Constructor from LongSequence and LongSubSeq +# where whole coding elements can be copied directly over +# without extracting individual elements -Base.length(::KmerNeighborIterator) = 4 -Base.eltype(::Type{KmerNeighborIterator{S}}) where {S<:Kmer} = S +# TODO: Kmer => LongSequence constructor, same as above but opposite, kinda. -function Base.iterate(it::KmerNeighborIterator{S}, i::UInt64 = 0) where {S<:Kmer} - if i == 4 - return nothing - else - #return S((encoded_data(it.x) << 2) | i), i + 1 - return it.x << 1, i + one(UInt64) - end -end -=# +# TODO: Constructor from String that predicts the alphabet? +# Maybe implement the guessparse function in BioSequences.jl +# (See related issue), then call it from here. 
-### -### String literals -### +################################################ +# String literals +################################################ macro mer_str(seq, flag) - seq′ = BioSequences.remove_newlines(seq) + trimmed = BioSequences.remove_newlines(seq) + # Unlike @dna_str, we default to 2-bit alphabets, because kmers + # by convention are usually 2-bit only if flag == "dna" || flag == "d" - T = kmertype(DNAKmer{length(seq′)}) - return T(seq′) + Kmer{DNAAlphabet{2}}(trimmed) elseif flag == "rna" || flag == "r" - T = kmertype(RNAKmer{length(seq′)}) - return T(seq′) - elseif flag == "aa" || flag == "a" || flag == "prot" || flag == "p" - T = kmertype(AAKmer{length(seq′)}) - return T(seq′) + Kmer{RNAAlphabet{2}}(trimmed) + elseif flag == "aa" || flag == "a" + Kmer{AminoAcidAlphabet}(trimmed) else error("Invalid type flag: '$(flag)'") end end -macro mer_str(seq) - seq′ = BioSequences.remove_newlines(seq) - T = kmertype(DNAKmer{length(seq′)}) - return T(seq′) -end +################## +# Various methods +################## + +# BioSequences interface +Base.length(x::Kmer) = ksize(typeof(x)) +Base.copy(x::Kmer) = x # immutable +BioSequences.encoded_data_eltype(::Type{<:Kmer}) = UInt +# BioSequences helper methods +BioSequences.encoded_data(seq::Kmer) = seq.data +# Misc methods +Base.summary(x::Kmer{A, K, N}) where {A, K, N} = string(eltype(x), ' ', K, "-mer") + +function Base.show(io::IO, ::MIME"text/plain", s::Kmer) + println(io, summary(s), ':') + print(io, s) +end + +function Base.print(io::IO, s::Kmer) + # TODO: Can be optimised but whatever + print(io, LongSequence(s)) +end diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 3bbe025..90e542e 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -1,6 +1,7 @@ # TODO: this should end up in BioSequences.jl? +#= "Extract the element stored in a packed bitarray referred to by bidx." 
@inline function BioSequences.extract_encoded_element( bidx::BioSequences.BitIndex{N, W}, @@ -79,4 +80,63 @@ end return (_reverse(bpe, tail...)..., BioSequences.reversebits(head, bpe)) end -@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () \ No newline at end of file +@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () +=# + + + +@inline function leftshift_carry( + x::NTuple{N, UInt64}, + nbits::Integer, + prevcarry::UInt64=zero(UInt64), +) where {N} + _, newbits = _leftshift_carry(nbits, prevcarry, x...) + return newbits +end + +@inline function _leftshift_carry(nbits::Integer, prevcarry::UInt64, head::UInt64, tail...) + carry, newtail = _leftshift_carry(nbits, prevcarry, tail...) + return head >> (64 - nbits), ((head << nbits) | carry, newtail...) +end + +@inline _leftshift_carry(nbits::Integer, prevcarry::UInt64) = prevcarry, () + + + + + +@inline function left_shift(x::Unsigned, n::Integer) + x << (n & ((sizeof(x) * 8) - 1)) +end + +@inline function right_shift(x::Unsigned, n::Integer) + x >>> (n & ((sizeof(x) * 8) - 1)) +end + +@inline function left_carry(x::Unsigned, n::Integer) + right_shift(x, 8 * sizeof(x) - n) +end + +@inline function right_carry(x::Unsigned, n::Integer) + left_shift(x, 8 * sizeof(x) - n) +end + +function leftshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} + head, tail... = x + new_head = left_shift(head, nbits) | carry + tail_carry = left_carry(head, nbits) + (new_carry, new_tail) = leftshift_carry(tail, nbits, tail_carry) + (new_carry, (new_head, new_tail...)) +end + +function rightshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} + head, tail... 
= x + new_head = right_shift(head, nbits) | carry + tail_carry = right_carry(head, nbits) + (new_carry, new_tail) = rightshift_carry(tail, nbits, tail_carry) + (new_carry, (new_head, new_tail...)) +end + +leftshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) +rightshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) + From cfaacb3f1719a1daef3e0ad9dd4045ff5ebdebc0 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 21 Jul 2023 11:29:41 +0200 Subject: [PATCH 04/33] More stuff --- src/Kmers.jl | 1 + src/indexing.jl | 31 +++++++++++++++++- src/kmer.jl | 68 +++++++++++++++++++++++++++++++--------- src/predicates.jl | 11 ------- src/transformations.jl | 41 ++++++++++++++++++++++++ src/tuple_bitflipping.jl | 50 +++++++++++------------------ 6 files changed, 144 insertions(+), 58 deletions(-) delete mode 100644 src/predicates.jl diff --git a/src/Kmers.jl b/src/Kmers.jl index 9592284..a319b35 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -143,6 +143,7 @@ const unsafe = Unsafe() include("tuple_bitflipping.jl") include("kmer.jl") include("indexing.jl") +include("transformations.jl") #= include("revtrans.jl") include("kmer_iteration/AbstractKmerIterator.jl") diff --git a/src/indexing.jl b/src/indexing.jl index a7f99e2..6b186d7 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -7,4 +7,33 @@ right_shift(@inbounds(seq.data[index]), offset) & mask end -# TODO: Index with range, index with bitvector \ No newline at end of file +# This is usually type unstable, but in user code, users may use constant-folded ranges, +# e.g. f(x) = x[2:4]. In this case, we need it to compile to very efficient code. 
+# Hence, it MUST use @inline +@inline function Base.getindex(kmer::Kmer{A}, range::AbstractRange{<:Integer}) where A + @boundscheck checkbounds(kmer, range) + K = length(range) + N = n_coding_elements(Kmer{A, K}) + T = Kmer{A, K, N} + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for i in range + (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + end + T(unsafe, data) +end + +# Same as above: This needs to be able to inline if the indices are known statically +@inline function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{Bool}) where A + @boundscheck checkbounds(eachindex(kmer), indices) + K = sum(indices) + N = n_coding_elements(Kmer{A, K}) + T = Kmer{A, K, N} + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for (i, bool) in enumerate(indices) + bool || continue + (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + end + T(unsafe, data) +end \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index d3b3923..cbd9eb0 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -22,6 +22,7 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} # Hence, a sequence A-G of 16-bit elements would pack like: # ( ABC, DEFG) # ^ 16 unused bits, the unused bits are always top bits of first UInt + # Unused bits are always zero data::NTuple{N, UInt} # This unsafe method do not clip the head @@ -29,12 +30,6 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} check_kmer(Kmer{A, K, N}) new{A, K, N}(data) end - - function Kmer{A, K, N}(data::NTuple{N, UInt}) where {A <: Alphabet, K, N} - check_kmer(Kmer{A, K, N}) - x = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) - return new(cliphead(x, data...)) - end end # Aliases @@ -86,6 +81,9 @@ end @inline ksize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = K @inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K 
+@inline bits_unused(T::Type{<:Kmer{A}}) where A = n_unused(T) * BioSequences.bits_per_symbol(A()) + +@inline BioSequences.Alphabet(::Kmer{A}) where A = A() @inline function n_coding_elements(::Type{<:Kmer{A, K}}) where {A, K} cld(BioSequences.bits_per_symbol(A()) * K, 8 * sizeof(UInt)) @@ -107,7 +105,7 @@ end # Constructors ################################################ -zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), nsize(T)) +zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) # Generic, unknown size @inline function construct_generic(::Base.SizeUnknown, T::Type{<:Kmer{A, K}}, itr) where {A, K} @@ -118,7 +116,7 @@ zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), nsize(T)) i > K && error("Length of sequence must be K elements to build Kmer") symbol = convert(eltype(A), element) carry = UInt(BioSequences.encode(A(), symbol)) - data = leftshift_carry(data, nbits, carry) + (_, data) = leftshift_carry(data, nbits, carry) end T(unsafe, data) end @@ -131,14 +129,14 @@ end for element in itr symbol = convert(eltype(A), element) carry = UInt(BioSequences.encode(A(), symbol)) - data = leftshift_carry(data, nbits, carry) + (_, data) = leftshift_carry(data, nbits, carry) end T(unsafe, data) end # Generic, size known but length not checked. 
@inline function construct_generic(iT::Union{Base.HasLength, Base.HasShape}, T::Type{<:Kmer{A, K}}, itr) where {A, K} - length(s) == K || error("Length of sequence must be K elements to build Kmer") + length(itr) == K || error("Length of sequence must be K elements to build Kmer") construct_generic_unchecked(iT, T, itr) end @@ -150,14 +148,14 @@ end check_kmer(T) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) - for i in 1:K - data = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) + for i in 1:ksize(T) + (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) end T(unsafe, data) end # BioSequence with another element type fall back to the generic length constructor -@inline function construct_unchecked(T::Type{<:Kmer}, s::BioSequence, data_eltype::Type) +@inline function construct_unchecked(T::Type{<:Kmer{A}}, s::BioSequence{A}, data_eltype::Type) where A construct_generic_unchecked(Base.HasLength(), T, s) end @@ -166,7 +164,7 @@ end # decode each symbol but simply move the encoded data directly into the tuple function Kmer{A, K, N}(s::BioSequence) where {A, K, N} length(s) == K || error("Length of sequence must be K elements to build Kmer") - construct_unchecked(T, s, BioSequences.encoded_data_eltype(typeof(s))) + construct_unchecked(Kmer{A, K, N}, s, BioSequences.encoded_data_eltype(typeof(s))) end # Generic constructor: Dispatch on the iteratorsize @@ -250,3 +248,45 @@ function Base.print(io::IO, s::Kmer) # TODO: Can be optimised but whatever print(io, LongSequence(s)) end + +Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) +Base.:(==)(x::Kmer{A}, y::Kmer{A}) where A = x.data == y.data +Base.isless(x::T, y::T) where {T <: Kmer} = isless(x.data, y.data) + +# TODO: We need to figure out what to do with hashing first. +# Per the contract of isequal, isequal(a, b) == (hash(a) == hash(b)). +# Further, it's imperative that hashing kmers is absolutely optimal. 
+# So, what to do? +Base.isequal(x::Kmer, y::BioSequence) = false +Base.isequal(x::BioSequence, y::Kmer) = false + +# TODO: Ensure this is the right way to go. +# See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 +Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) + +function push(kmer::Kmer{A}, s::BioSequences.BioSymbol) where A + bps = BioSequences.bits_per_symbol(A()) + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) + (head, tail...) = new_data + head = head | left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) + typeof(kmer)(unsafe, (head, tail...)) +end + +function pushlast(kmer::Kmer{A}, s::BioSequences.BioSymbol) where A + bps = BioSequences.bits_per_symbol(A()) + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + (_, new_data) = leftshift_carry(kmer.data, bps, encoding) + (head, tail...) = new_data + typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) +end + +######################################## +# Various bit-twiddling useful functions +######################################## + +# Get a mask 0x0001111 ... masking away the unused bits of the head element +# in the UInt tuple +@inline function get_mask(T::Type{<:Kmer}) + UInt(1) << bits_unused(T) - 1 +end \ No newline at end of file diff --git a/src/predicates.jl b/src/predicates.jl deleted file mode 100644 index 5a79f3c..0000000 --- a/src/predicates.jl +++ /dev/null @@ -1,11 +0,0 @@ -### -### Mer specific specializations of src/biosequence/predicates.jl -### - -Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) -Base.:(==)(x::T, y::T) where {T <: Kmer} = x.data == y.data -Base.isless(x::T, y::T) where {T <: Kmer} = isless(x.data, y.data) - -# TODO: Ensure this is the right way to go. 
-# See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 -Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) diff --git a/src/transformations.jl b/src/transformations.jl index 6f898c2..1625bdb 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -1,4 +1,44 @@ +function Base.reverse(x::Kmer) + # ( ABC, DEFG) # reverse each element + # (CBA , GFED) # reverse elements + # (GFED, CBA ) # rightshift carry a zero + # ( GFE, DBCA) # final result + Bps = BioSequences.BitsPerSymbol(Alphabet(x)) + data = map(i -> BioSequences.reversebits(i, Bps), reverse(x.data)) + (_, data) = rightshift_carry(data, bits_unused(typeof(x)), zero(UInt)) + typeof(x)(unsafe, data) +end + +# For this method, we don't need to mask the unused bits, because the complement of +# 0x0 == DNA_Gap is still DNA_Gap +function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}) + isempty(x) && return x + data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) + typeof(x)(unsafe, data) +end + +# For this method we do +function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}) + isempty(x) && return x + data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) + (head, tail...) = data + typeof(x)(unsafe, ((head & get_mask(typeof(x))), tail...)) +end + +# Generic fallback +function BioSequences.complement(x::Kmer{<:NucleicAcidAlphabet}) + construct_generic_unchecked(Base.HasLength(), typeof(x), (complement(i) for i in x)) +end +# TODO: Should this be the generic BioSequence def in BioSequences.jl? 
+function BioSequences.reverse_complement(x::Kmer) + reverse(complement(x)) +end + + + + +#= # Bit-parallel element nucleotide complementation @inline function _complement_bitpar( a::A, @@ -223,3 +263,4 @@ function BioSequences.translate( end return T(data) end +=# \ No newline at end of file diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 90e542e..0858748 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -83,28 +83,8 @@ end @inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () =# - - -@inline function leftshift_carry( - x::NTuple{N, UInt64}, - nbits::Integer, - prevcarry::UInt64=zero(UInt64), -) where {N} - _, newbits = _leftshift_carry(nbits, prevcarry, x...) - return newbits -end - -@inline function _leftshift_carry(nbits::Integer, prevcarry::UInt64, head::UInt64, tail...) - carry, newtail = _leftshift_carry(nbits, prevcarry, tail...) - return head >> (64 - nbits), ((head << nbits) | carry, newtail...) -end - -@inline _leftshift_carry(nbits::Integer, prevcarry::UInt64) = prevcarry, () - - - - - +# These compile to raw CPU instructions and are therefore more +# efficient than simply using << and >>> @inline function left_shift(x::Unsigned, n::Integer) x << (n & ((sizeof(x) * 8) - 1)) end @@ -113,6 +93,8 @@ end x >>> (n & ((sizeof(x) * 8) - 1)) end +# When the UInt is shifted n bits, these are the bits +# that are shifted away (carried over) @inline function left_carry(x::Unsigned, n::Integer) right_shift(x, 8 * sizeof(x) - n) end @@ -121,22 +103,26 @@ end left_shift(x, 8 * sizeof(x) - n) end -function leftshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} +# Shift a tuple left nbits, carry over bits between tuple elements, and OR +# the `carry` argument to the right side of the resulting tuple. +# Returns (new_carry, new_tuple) +@inline function leftshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} head, tail... 
= x - new_head = left_shift(head, nbits) | carry - tail_carry = left_carry(head, nbits) - (new_carry, new_tail) = leftshift_carry(tail, nbits, tail_carry) - (new_carry, (new_head, new_tail...)) + (new_carry, new_tail) = leftshift_carry(tail, nbits, carry) + new_head = left_shift(head, nbits) | new_carry + (left_carry(head, nbits), (new_head, new_tail...)) end -function rightshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} +@inline function rightshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} head, tail... = x - new_head = right_shift(head, nbits) | carry - tail_carry = right_carry(head, nbits) + new_head = right_shift(head, nbits) | right_carry(carry, nbits) + mask = left_shift(UInt(1), nbits) - 1 + tail_carry = head & mask (new_carry, new_tail) = rightshift_carry(tail, nbits, tail_carry) (new_carry, (new_head, new_tail...)) end -leftshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) -rightshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) +# Recusion terminator for above +@inline leftshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) +@inline rightshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) From 2c64877f900ebe3b19da7acce473f259476cca0e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 21 Jul 2023 16:06:51 +0200 Subject: [PATCH 05/33] Add revtrans and setindex --- src/Kmers.jl | 2 +- src/indexing.jl | 21 ++++ src/kmer.jl | 7 ++ src/revtrans.jl | 4 +- src/transformations.jl | 231 +---------------------------------------- 5 files changed, 36 insertions(+), 229 deletions(-) diff --git a/src/Kmers.jl b/src/Kmers.jl index a319b35..9e8ac9a 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -144,8 +144,8 @@ include("tuple_bitflipping.jl") include("kmer.jl") include("indexing.jl") include("transformations.jl") -#= include("revtrans.jl") +#= include("kmer_iteration/AbstractKmerIterator.jl") 
include("kmer_iteration/EveryKmer.jl") include("kmer_iteration/SpacedKmers.jl") diff --git a/src/indexing.jl b/src/indexing.jl index 6b186d7..d38e921 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -36,4 +36,25 @@ end (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) end T(unsafe, data) +end + +@inline function BioSequences.bitindex(kmer::Kmer, i::Unsigned)::Tuple{UInt, UInt} + bps = BioSequences.bits_per_symbol(Alphabet(kmer)) % UInt + bpe = (8 * sizeof(UInt)) % UInt + (i, o) = divrem((UInt(i) - UInt(1) + n_unused(typeof(kmer))) * bps, bpe) + o = bpe - o - bps + i + 1, o +end + +@inline function setindex(kmer::Kmer, i::Integer, s) + @boundscheck checkbounds(kmer, i) + bps = BioSequences.bits_per_symbol(Alphabet(kmer)) + symbol = convert(eltype(kmer), s) + encoding = UInt(BioSequences.encode(Alphabet(kmer), symbol)) + (i, o) = BioSequences.bitindex(kmer, i % UInt) + element = @inbounds kmer.data[i] + mask = left_shift(UInt(1) << bps - 1, o) + element &= ~mask + element |= left_shift(encoding, o) + typeof(kmer)(unsafe, @inbounds Base.setindex(kmer.data, element, i)) end \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index cbd9eb0..9023b6c 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -195,6 +195,13 @@ function Kmer{A}(s::Union{String, SubString{String}}) where A construct_generic_unchecked(Base.HasLength(), Kmer{A, K, N}, s) end +# With a different A +function Kmer{A}(s::BioSequence) where A + K = length(s) + N = n_coding_elements(Kmer{A, K}) + Kmer{A, K, N}(s) +end + # TODO: Constructor from LongSequence and LongSubSeq # where whole coding elements can be copied directly over # without extracting individual elements diff --git a/src/revtrans.jl b/src/revtrans.jl index 7cb021b..e43a79e 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -36,7 +36,7 @@ CodonSet() = CodonSet(UInt64(0), Unsafe()) CodonSet(itr) = foldl(push, itr; init=CodonSet()) function Base.iterate(x::CodonSet, s::UInt64=x.x) - codon = 
RNACodon((trailing_zeros(s) % UInt64,)) + codon = RNACodon(unsafe, (trailing_zeros(s) % UInt64,)) iszero(s) ? nothing : (codon, s & (s - 1)) end @@ -99,7 +99,7 @@ function ReverseGeneticCode(x::BioSequences.GeneticCode) x_set = CodonSet() for i in Int64(0):Int64(63) aa = x.tbl[i + 1] - codon = RNACodon((i % UInt64,)) + codon = RNACodon(unsafe, (i % UInt64,)) sets[ind(aa)] = push(sets[ind(aa)], codon) if aa !== AA_Term x_set = push(x_set, codon) diff --git a/src/transformations.jl b/src/transformations.jl index 1625bdb..9383e29 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -35,232 +35,11 @@ function BioSequences.reverse_complement(x::Kmer) reverse(complement(x)) end - - - -#= -# Bit-parallel element nucleotide complementation -@inline function _complement_bitpar( - a::A, - head::UInt64, - tail..., -) where {A <: NucleicAcidAlphabet} - return (BioSequences.complement_bitpar(head, A()), _complement_bitpar(a, tail...)...) -end - -@inline _complement_bitpar(a::A) where {A <: NucleicAcidAlphabet} = () - -@inline function pushfirst(x::Kmer{A, K, N}, nt) where {A, K, N} - ntbits = UInt64(BioSequences.encode(A(), nt)) << (62 - (64N - 2K)) - #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) << (62 - (64N - 2K)) - return Kmer{A, K, N}(_rightshift_carry(2, ntbits, x.data...)) +function BioSequences.canonical(x::Kmer) + rc = reverse_complement(x) + ifelse(x < rc, x, rc) end -@inline function pushlast(x::Kmer{A, K, N}, nt) where {A, K, N} - ntbits = UInt64(BioSequences.encode(A(), nt)) - #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) - _, newbits = _leftshift_carry(2, ntbits, x.data...) - return Kmer{A, K, N}(newbits) -end - -### -### Transformation methods -### - -""" - complement(seq::T) where {T<:Kmer} - -Return a kmer's complement kmer. 
- -# Examples - -```jldoctest -julia> complement(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -AATCG -``` -""" -@inline function BioSequences.complement(seq::T) where {T <: Kmer} - return T(_complement_bitpar(Alphabet(seq), seq.data...)) -end - -""" - reverse(seq::Kmer{A,K,N}) where {A,K,N} - -Return a kmer that is the reverse of the input kmer. - -# Examples - -```jldoctest -julia> reverse(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -CGATT -``` -""" -@inline function Base.reverse(seq::Kmer{A, K, N}) where {A, K, N} - rdata = _reverse(BioSequences.BitsPerSymbol(seq), seq.data...) - # rshift should constant-fold. - rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) - return Kmer{A, K, N}(rightshift_carry(rdata, rshift)) # based on only 2 bit alphabet. -end - -""" - reverse_complement(seq::Kmer) - -Return the kmer that is the reverse complement of the input kmer. - -# Examples - -```jldoctest -julia> reverse_complement(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -GCTAA -``` -""" -@inline function BioSequences.reverse_complement(seq::Kmer{A, K, N}) where {A, K, N} - return complement(reverse(seq)) -end +BioSequences.iscanonical(x::Kmer) = x <= reverse_complement(x) -#= -@inline function reverse_complement2(seq::Kmer{A,K,N}) where {A,K,N} - f = x -> complement_bitpar(x, A()) - rdata = _reverse(f, BioSequences.BitsPerSymbol(seq), seq.data...) - return Kmer{A,K,N}(rightshift_carry(rdata, 64N - 2K)) -end -=# - -""" - BioSequences.canonical(seq::Kmer{A,K,N}) where {A,K,N} - -Return the canonical sequence of `seq`. - -A canonical sequence is the numerical lesser of a kmer and its reverse complement. -This is useful in hashing/counting sequences in data that is not strand specific, -and thus observing the short sequence is equivalent to observing its reverse complement. 
- -# Examples - -```jldoctest -julia> canonical(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -GCTAA -``` -""" -@inline function BioSequences.canonical(seq::Kmer{A, K, N}) where {A, K, N} - if N < 4 - return min(seq, reverse_complement(seq)) - else - return iscanonical(seq) ? seq : reverse_complement(seq) - end -end - -### -### Old Mer specific specializations of src/biosequence/transformations.jl -### - not currently transferred to new type. - -# TODO: Sort this and decide on transferring to new NTuple based kmers or no. - -#= -function swap(x::T, i, j) where {T<:AbstractMer} - i = 2 * length(x) - 2i - j = 2 * length(x) - 2j - b = encoded_data(x) - x = ((b >> i) ⊻ (b >> j)) & encoded_data_type(x)(0x03) - return T(b ⊻ ((x << i) | (x << j))) -end - -function Random.shuffle(x::T) where {T<:AbstractMer} - # Fisher-Yates shuffle for mers. - j = lastindex(x) - for i in firstindex(x):(j - 1) - j′ = rand(i:j) - x = swap(x, i, j′) - end - return x -end -=# - -throw_translate_err(K) = error("Cannot translate Kmer of size $K not divisible by 3") - -@inline function setup_translate(seq::Kmer{<:NucleicAcidAlphabet, K}) where {K} - naa, rem = divrem(K, 3) - iszero(rem) || throw_translate_err(K) - kmertype(AAKmer{naa}) -end - -# This sets the first amino acid to methionine, returning the data tuple -@inline function set_methionine_data(data::Tuple{Vararg{UInt64}}, ::Val{K}) where {K} - offset = ((K - 1) * 8) & 63 - mask = ~(UInt64(0xff) << offset) # mask off existing AA in pos 1 - addition = UInt64(0x0c) << offset # 0x0c is encoded methionine - chunk, rest... = data - chunk = (chunk & mask) | addition - return (chunk, rest...) 
-end - -function BioSequences.translate( - seq::Union{RNAKmer, DNAKmer}; - code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool=true, # a noop for this method - alternative_start::Bool=false, -) - T = setup_translate(seq) - data = blank_ntuple(T) - for i in 1:ksize(T) - a = seq[3 * i - 2] - b = seq[3 * i - 1] - c = seq[3 * i - 0] - codon = BioSequences.unambiguous_codon(a, b, c) - aa = code[codon] - # Next line is equivalent to encode, but without checking. - # We assume genetic codes do not code to invalid data. - enc_data = reinterpret(UInt8, aa) % UInt64 - data = leftshift_carry(data, 8, enc_data) - end - # This is probably not needed for kmers, but kept for compatibility. - # It does slightly slow down translation, even when not taken. - if alternative_start && !iszero(ksize(T)) - data = set_methionine_data(data, Val(ksize(T))) - end - return T(data) -end - -# See the function above for comments, or the equivalent function -# in BioSequences -function BioSequences.translate( - seq::Kmer{<:NucleicAcidAlphabet}; - code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool=true, - alternative_start::Bool=false, -) - T = setup_translate(seq) - data = blank_ntuple(T) - for i in 1:ksize(T) - a = reinterpret(RNA, seq[3 * i - 2]) - b = reinterpret(RNA, seq[3 * i - 1]) - c = reinterpret(RNA, seq[3 * i - 0]) - aa = - if BioSequences.isambiguous(a) | - BioSequences.isambiguous(b) | - BioSequences.isambiguous(c) - aa_ = BioSequences.try_translate_ambiguous_codon(code, a, b, c) - if aa_ === nothing - if allow_ambiguous_codons - aa_ = AA_X - else - error("codon ", a, b, c, " cannot be unambiguously translated") - end - end - aa_ - else - code[BioSequences.unambiguous_codon(a, b, c)] - end - enc_data = reinterpret(UInt8, aa) % UInt64 - data = leftshift_carry(data, 8, enc_data) - end - if alternative_start && !iszero(ksize(T)) - data = set_methionine_data(data, Val(ksize(T))) - end - return T(data) -end -=# \ No newline at end of file +# TODO: 
Translation \ No newline at end of file From 1ecb76cadcdce0cac47967def219eda31103ab2a Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 21 Jul 2023 16:37:01 +0200 Subject: [PATCH 06/33] Add translation --- src/kmer.jl | 4 +-- src/transformations.jl | 61 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/src/kmer.jl b/src/kmer.jl index 9023b6c..ca3f50e 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -144,7 +144,7 @@ end # and encode, but can copy the raw bits directly into the kmer @inline function construct_unchecked( T::Type{<:Kmer{A}}, s::BioSequence{A}, data_eltype::Type{E} -) where {A, E <: Union{UInt8, UInt16, UInt32, UInt}} +) where {A <: Alphabet, E <: Union{UInt8, UInt16, UInt32, UInt}} check_kmer(T) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) @@ -155,7 +155,7 @@ end end # BioSequence with another element type fall back to the generic length constructor -@inline function construct_unchecked(T::Type{<:Kmer{A}}, s::BioSequence{A}, data_eltype::Type) where A +@inline function construct_unchecked(T::Type{<:Kmer}, s::BioSequence, data_eltype::Type) construct_generic_unchecked(Base.HasLength(), T, s) end diff --git a/src/transformations.jl b/src/transformations.jl index 9383e29..4ad7651 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -42,4 +42,63 @@ end BioSequences.iscanonical(x::Kmer) = x <= reverse_complement(x) -# TODO: Translation \ No newline at end of file +function translate( + seq::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}; + code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool = true, # noop in this method + alternative_start::Bool = false +) + n_aa, remainder = divrem(length(seq), 3) + iszero(remainder) || error("LongRNA length is not divisible by three. 
Cannot translate.") + N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) + T = Kmer{AminoAcidAlphabet, n_aa, N} + data = zero_tuple(T) + @inbounds for i in 1:n_aa + a = seq[3i-2] + b = seq[3i-1] + c = seq[3i-0] + codon = BioSequences.unambiguous_codon(a, b, c) + aa = code[codon] + carry = UInt(reinterpret(UInt8, aa)) + (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) + end + result = T(unsafe, data) + if alternative_start && !iszero(ksize(seq)) + return setindex(result, 1, AA_M) + else + return result + end +end + +function translate( + seq::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; + code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool = true, # noop in this method + alternative_start::Bool = false +) + n_aa, remainder = divrem(length(seq), 3) + iszero(remainder) || error("LongRNA length is not divisible by three. Cannot translate.") + N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) + T = Kmer{AminoAcidAlphabet, n_aa, N} + data = zero_tuple(T) + @inbounds for i in 1:n_aa + a = reinterpret(RNA, seq[3i-2]) + b = reinterpret(RNA, seq[3i-1]) + c = reinterpret(RNA, seq[3i-0]) + aa = if isgap(a) | isgap(b) | isgap(c) + error("Cannot translate nucleotide sequences with gaps.") + elseif iscertain(a) & iscertain(b) & iscertain(c) + code[BioSequences.unambiguous_codon(a, b, c)] + else + BioSequences.try_translate_ambiguous_codon(code, a, b, c, allow_ambiguous_codons) + end + carry = UInt(reinterpret(UInt8, aa)) + (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) + end + result = T(unsafe, data) + if alternative_start && !iszero(ksize(seq)) + return setindex(result, 1, AA_M) + else + return result + end +end \ No newline at end of file From 7385d9ba00f489cfbc7248162e831cb989ddfe9b Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 22 Jul 2023 15:33:30 +0200 Subject: [PATCH 07/33] Make EveryKmer iterator --- src/Kmers.jl | 3 
+ src/counting.jl | 67 ------- src/indexing.jl | 4 +- src/iterators/EveryKmer.jl | 168 ++++++++++++++++++ src/iterators/common.jl | 7 + src/kmer.jl | 94 ++++++---- .../AbstractKmerIterator.jl | 0 .../EveryCanonicalKmer.jl | 0 src/{kmer_iteration => old_iter}/EveryKmer.jl | 0 .../SpacedCanonicalKmers.jl | 0 .../SpacedKmers.jl | 0 src/transformations.jl | 2 +- 12 files changed, 244 insertions(+), 101 deletions(-) delete mode 100644 src/counting.jl create mode 100644 src/iterators/EveryKmer.jl create mode 100644 src/iterators/common.jl rename src/{kmer_iteration => old_iter}/AbstractKmerIterator.jl (100%) rename src/{kmer_iteration => old_iter}/EveryCanonicalKmer.jl (100%) rename src/{kmer_iteration => old_iter}/EveryKmer.jl (100%) rename src/{kmer_iteration => old_iter}/SpacedCanonicalKmers.jl (100%) rename src/{kmer_iteration => old_iter}/SpacedKmers.jl (100%) diff --git a/src/Kmers.jl b/src/Kmers.jl index 9e8ac9a..5c74689 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -145,6 +145,9 @@ include("kmer.jl") include("indexing.jl") include("transformations.jl") include("revtrans.jl") + +include("iterators/common.jl") +include("iterators/EveryKmer.jl") #= include("kmer_iteration/AbstractKmerIterator.jl") include("kmer_iteration/EveryKmer.jl") diff --git a/src/counting.jl b/src/counting.jl deleted file mode 100644 index edefc1e..0000000 --- a/src/counting.jl +++ /dev/null @@ -1,67 +0,0 @@ -### -### Mer specific specializations of src/biosequence/counting.jl -### - -for i in [ - (:_count_a, :a_bitcount), - (:_count_c, :c_bitcount), - (:_count_g, :g_bitcount), - (:_count_t, :t_bitcount), -] - @eval begin - @inline function $(i[1])( - alph::A, - head::UInt64, - tail..., - ) where {A <: NucleicAcidAlphabet} - return $BioSequences.$(i[2])(head, alph) + $(i[1])(alph, tail...) - end - @inline $(i[1])(alph::A) where {A <: NucleicAcidAlphabet} = 0 - end -end - -@inline function _count_gc(alph::A, head::UInt64, tail...) 
where {A <: NucleicAcidAlphabet} - return BioSequences.gc_bitcount(head, alph) + _count_gc(alph, tail...) -end -@inline _count_gc(::A) where {A <: NucleicAcidAlphabet} = 0 - -count_a(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = - _count_a(A(), x.data...) - n_unused(x) -count_c(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_c(A(), x.data...) -count_g(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_g(A(), x.data...) -count_t(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = _count_t(A(), x.data...) - -count_gc(x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = - _count_gc(A(), x.data...) -Base.count(::typeof(isGC), x::Kmer{A, K, N}) where {A <: NucleicAcidAlphabet, K, N} = - count_gc(x) - -# TODO: Expand to Amino Acid Kmers as well... -@inline function Base.count( - ::typeof(!=), - a::Kmer{A, K, N}, - b::Kmer{A, K, N}, -) where {A <: NucleicAcidAlphabet, K, N} - ad = a.data - bd = b.data - sum = 0 - @inbounds for i in 1:N - sum += BioSequences.mismatch_bitcount(ad[i], bd[i], A()) - end - return sum -end - -# TODO: Expand to Amino Acid Kmers as well... 
-@inline function Base.count( - ::typeof(==), - a::Kmer{A, K, N}, - b::Kmer{A, K, N}, -) where {A <: NucleicAcidAlphabet, K, N} - ad = a.data - bd = b.data - sum = 0 - @inbounds for i in 1:N - sum += BioSequences.match_bitcount(ad[i], bd[i], A()) - end - return sum - n_unused(a) -end diff --git a/src/indexing.jl b/src/indexing.jl index d38e921..fcfa948 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -12,9 +12,7 @@ end # Hence, it MUST use @inline @inline function Base.getindex(kmer::Kmer{A}, range::AbstractRange{<:Integer}) where A @boundscheck checkbounds(kmer, range) - K = length(range) - N = n_coding_elements(Kmer{A, K}) - T = Kmer{A, K, N} + T = derive_type(Kmer{A, length(range)}) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) for i in range diff --git a/src/iterators/EveryKmer.jl b/src/iterators/EveryKmer.jl new file mode 100644 index 0000000..434ccf3 --- /dev/null +++ b/src/iterators/EveryKmer.jl @@ -0,0 +1,168 @@ +""" + EveryKmer{S, A <: Alphabet, K} + +Iterator of every forward kmer. `S` signifies the type of the underlying sequence, +and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. + +Can be constructed more conventiently with the constructors `EveryDNAMer{S, K}(s)` +and `EveryDNAMer{K}(s)`, and similar also for `EveryRNAMer` and `EveryAAMer`. + +If `A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}` and +`Alphabet(S) isa Union{DNAAlphabet{4}, RNAAlphabet{4}}`, the iterator skips all +kmers containing symbols not permitted in the 2-bit nucleotide alphabet. 
+ +# Examples: +```jldoctest +julia> v = collect(EveryDNAMer{3}("AGCGTATA")); + +julia eltype(v), length(v) +(Kmer{DNAAlphabet{2}, 3, 1}, 6) + +julia> length(collect(EveryRNAMer{3}(rna"UGDCUGAVC"))) +2 +``` +""" +struct EveryKmer{S, A <: Alphabet, K} <: AbstractKmerIterator{A, K} + seq::S +end + +# Constructors +EveryKmer{A, K}(s) where {A <: Alphabet, K} = EveryKmer{typeof(s), A, K} +const EveryDNAMer{S, K} = EveryKmer{S, DNAAlphabet{2}, K} +const EveryRNAMer{S, K} = EveryKmer{S, RNAAlphabet{2}, K} +const EveryAAMer{S, K} = EveryKmer{S, AminoAcidAlphabet, K} + +EveryDNAMer{K}(s) where K = EveryDNAMer{typeof(s), K}(s) +EveryRNAMer{K}(s) where K = EveryRNAMer{typeof(s), K}(s) +EveryAAMer{K}(s) where K = EveryAAMer{typeof(s), K}(s) + +function EveryKmer{S, A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} + s2 = codeunits(s) + EveryKmer{typeof(s2), A, K}(s2) +end + +const SameEveryKmer{S, A, K} = EveryKmer{S, A} where {A, S <: BioSequence{A}} +const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} +const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} + +# Known length if every symbol of the sequence can be represented in the kmer +Base.IteratorSize(::Type{<:SameEveryKmer}) = Base.HasLength() +Base.IteratorSize(::Type{<:EveryKmer{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() + +function Base.length(it::SameEveryKmer{S, A, K}) where {S, A, K} + length(it.seq) - K + 1 +end + +# These methods can carry the encoding directly over +function Base.iterate(it::EveryKmer{S, A, K}) where {A, K, S <: BioSequence{A}} + seq = it.seq + length(seq) < K && return nothing + data = zero_tuple(eltype(it)) + bps = BioSequences.bits_per_symbol(A()) + for i in 1:K + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + (_, data) = leftshift_carry(data, bps, encoding) + end + kmer = eltype(it)(unsafe, data) + (kmer, (kmer, K+1)) +end + +function Base.iterate(it::EveryKmer{S, A, K}, state::Tuple{Kmer, Integer}) where {A, K, S <: BioSequence{A}} + 
seq = it.seq + (kmer, i) = state + i > length(seq) && return nothing + bps = BioSequences.bits_per_symbol(A()) + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + # TODO: This shares code with pushlast, might want to refactor that. + (_, new_data) = leftshift_carry(kmer.data, bps, encoding) + (head, tail...) = new_data + new_kmer = typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + (new_kmer, (new_kmer, i+1)) +end + +# These methods can use special 2 -> 4 bit recoding +@inline recode(encoding::UInt) = left_shift(UInt(1), encoding) + +function Base.iterate(it::EveryKmer{S, <:FourBit, K}) where {S <: BioSequence{<:TwoBit}, K} + seq = it.seq + length(seq) < K && return nothing + data = zero_tuple(eltype(it)) + for i in 1:K + encoding = recode(UInt(BioSequences.extract_encoded_element(seq, i))) + (_, data) = leftshift_carry(data, 4, encoding) + end + kmer = eltype(it)(unsafe, data) + (kmer, (kmer, K+1)) +end + +# TODO: Lots of code sharing in this file... can we refactor to be more clever? +function Base.iterate(it::EveryKmer{S, <:FourBit}, state::Tuple{Kmer, Integer}) where {S <: BioSequence{<:TwoBit}} + seq = it.seq + (kmer, i) = state + i > length(seq) && return nothing + encoding = recode(UInt(BioSequences.extract_encoded_element(seq, i))) + (_, new_data) = leftshift_carry(kmer.data, 4, encoding) + (head, tail...) = new_data + new_kmer = typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + (new_kmer, (new_kmer, i+1)) +end + +# This is special because, by convention, we skip every ambiguous kmer +# instead of erroring. +function Base.iterate( + it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) +) where {A <: TwoBit, S <: BioSequence{<:FourBit}, K} + (kmer, remaining, i) = state + data = kmer.data + seq = it.seq + while !iszero(remaining) + i > length(seq) && return nothing + # TODO: Also, LUT here? + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + i += 1 + # TODO: Is lookup table faster? 
+ remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) + (_, new_data) = leftshift_carry(data, 2, trailing_zeros(encoding) % UInt) + (head, tail...) = new_data + data = (head & get_mask(typeof(kmer)), tail...) + end + new_kmer = typeof(kmer)(unsafe, data) + return (new_kmer, (new_kmer, 1, i)) +end + +const BYTE_LUT = let + v = fill(0xff, 256) + for (i, s) in [(0, "Aa"), (1, "cC"), (2, "gG"), (3, "TtUu")], c in s + v[UInt8(c) + 1] = i + end + for c in "-MRSVWYHKDBN" + v[UInt8(c) + 1] = 0xf0 + v[UInt8(lowercase(c)) + 1] = 0xf0 + end + Tuple(v) +end + +# TODO: Change to lazy_str when new Julia LTS drops after 1.6 +@noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") + +function Base.iterate( + it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) +) where {A <: TwoBit, S <: AbstractVector{UInt8}, K} + (kmer, remaining, i) = state + data = kmer.data + seq = it.seq + Base.require_one_based_indexing(seq) + while !iszero(remaining) + i > length(seq) && return nothing + byte = @inbounds seq[i] + i += 1 + encoding = @inbounds BYTE_LUT[byte + 0x01] + encoding == 0xff && throw_bad_byte_error(byte) + remaining = ifelse(encoding == 0xf0, K, remaining - 1) + (_, new_data) = leftshift_carry(data, 2, encoding % UInt) + (head, tail...) = new_data + data = (head & get_mask(typeof(kmer)), tail...) 
+ end + new_kmer = typeof(kmer)(unsafe, data) + return (new_kmer, (new_kmer, 1, i)) +end \ No newline at end of file diff --git a/src/iterators/common.jl b/src/iterators/common.jl new file mode 100644 index 0000000..9d8bbae --- /dev/null +++ b/src/iterators/common.jl @@ -0,0 +1,7 @@ +abstract type AbstractKmerIterator{A <: Alphabet, K} end + +function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} + Kmer{A, K, n_coding_elements(Kmer{A, K})} +end + +Base.IteratorSize(::Type{<:AbstractKmerIterator}) = Base.SizeUnknown() \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index ca3f50e..4233124 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -1,18 +1,31 @@ -# Notes about Kmers's representation: -# Each element is encoded in the same way as a LongSequence, however the order -# is different. In a Kmer, the elements fill from MSB to LSB, from first to -# last tuple index. Unused bits are always zeroed. -# This layout complicates some Kmer construction code, but simplifies comparison -# operators, and we really want Kmers to be efficient. - """ Kmer{A<:Alphabet,K,N} <: BioSequence{A} A parametric, immutable, bitstype for representing k-mers - short sequences -of a fixed length K. - +of a fixed length `K`. Since they can be stored directly in registers, `Kmer`s are generally the most efficient type of `BioSequence`, when `K` is small and known at compile time. +The `N` parameter is derived from `A` and `K` and is not a free parameter. + +# Examples +```jldoctest +julia> m = Kmer{DNAAlphabet{4}}("AGCKN") # type-unstable +DNA 5-mer +AGCKN + +julia> length(m) == 5 +true + +julia> DNAKmer(dna"TGCTTA") isa DNAKmer{6} +true + +julia> AAKmer((lowercase(i) for i in "KLWYR")) isa AAKmer{5} +true + +julia> RNAKmer{3}("UA") +ERROR: +[ ... ] +``` """ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} # The number of UInt is always exactly the number needed, no less, no more. 
@@ -23,6 +36,9 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} # ( ABC, DEFG) # ^ 16 unused bits, the unused bits are always top bits of first UInt # Unused bits are always zero + + # This layout complicates some Kmer construction code, but simplifies comparison + # operators, and we really want Kmers to be efficient. data::NTuple{N, UInt} # This unsafe method do not clip the head @@ -51,10 +67,7 @@ const RNACodon = RNAKmer{3, 1} """ check_kmer(::Type{Kmer{A,K,N}}) where {A,K,N} -Internal method - enforces good kmer type parameterisation. - -For a given Kmer{A,K,N} of length K, the number of words used to -represent it (N) should be the minimum needed to contain all K symbols. +Internal methods that checks that the type parameters are good. This function should compile to a noop in case the parameterization is good. """ @@ -101,12 +114,20 @@ end per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) end +@inline derive_type(::Type{Kmer{A, K}}) where {A, K} = Kmer{A, K, n_coding_elements(Kmer{A, K})} + ################################################ # Constructors ################################################ zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) +# TODO: Should this somehow throw a MethodError if N is already parameterized? 
+function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} + T2 = derive_type(Kmer{A, K}) + T2(unsafe, zero_tuple(T2)) +end + # Generic, unknown size @inline function construct_generic(::Base.SizeUnknown, T::Type{<:Kmer{A, K}}, itr) where {A, K} check_kmer(T) @@ -181,25 +202,38 @@ end # Derived constructors ################################################ -# Where the parameters of the kmer is not specified in the constructor -function Kmer(s::BioSequence{A}) where A - K = length(s) - N = n_coding_elements(Kmer{A, K}) - Kmer{A, K, N}(s) +# BioSequence: Various missing type parameters +Kmer{A, K}(s::BioSequence) where {A, K} = derive_type(Kmer{A, K})(s) +Kmer{A}(s::BioSequence) where A = derive_type(Kmer{A, length(s)})(s) +Kmer(s::BioSequence{A}) where A = derive_type(Kmer{A, length(s)})(s) + +# Iterators: Various missing type parameters. +# It's too impractical to construct a kmer before we know the value of K, +# so either the iterator must have a known length, or else we need to collect +# it first +Kmer{A, K}(itr) where {A, K} = Kmer{A, K}(Base.IteratorSize(itr), itr) +Kmer{A, K}(::Base.SizeUnknown, itr) where {A, K} = Kmer{A, K}(collect(itr)) + +function Kmer{A, K}(iT::Union{Base.HasLength, Base.HasShape}, itr) where {A, K} + length(itr) == K || error("Length of sequence must be K elements to build Kmer") + construct_generic_unchecked(iT, derive_type(Kmer{A, K}), itr) end -# Where A, but not K is specified +Kmer{A}(itr) where A = Kmer{A}(Base.IteratorSize(itr), itr) +Kmer{A}(::Base.SizeUnknown, itr) where A = Kmer{A}(vec(collect(itr))) + +function Kmer{A}(iT::Union{Base.HasLength, Base.HasShape}, itr) where A + construct_generic_unchecked(iT, derive_type(Kmer{A, length(itr)}), itr) +end + +# Strings: Various missing type parameters function Kmer{A}(s::Union{String, SubString{String}}) where A - K = length(s) - N = n_coding_elements(Kmer{A, K}) - construct_generic_unchecked(Base.HasLength(), Kmer{A, K, N}, s) + construct_generic_unchecked(Base.HasLength(), 
derive_type(Kmer{A, length(s)}), s) end -# With a different A -function Kmer{A}(s::BioSequence) where A - K = length(s) - N = n_coding_elements(Kmer{A, K}) - Kmer{A, K, N}(s) +function Kmer{A, K}(s::Union{String, SubString{String}}) where {A, K} + length(s) == K || error("Length of sequence must be K elements to build Kmer") + construct_generic_unchecked(Base.HasLength(), derive_type(Kmer{A, K}), s) end # TODO: Constructor from LongSequence and LongSubSeq @@ -288,9 +322,9 @@ function pushlast(kmer::Kmer{A}, s::BioSequences.BioSymbol) where A typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) end -######################################## -# Various bit-twiddling useful functions -######################################## +################################################# +# Various bit-twiddling useful functions on kmers +################################################# # Get a mask 0x0001111 ... masking away the unused bits of the head element # in the UInt tuple diff --git a/src/kmer_iteration/AbstractKmerIterator.jl b/src/old_iter/AbstractKmerIterator.jl similarity index 100% rename from src/kmer_iteration/AbstractKmerIterator.jl rename to src/old_iter/AbstractKmerIterator.jl diff --git a/src/kmer_iteration/EveryCanonicalKmer.jl b/src/old_iter/EveryCanonicalKmer.jl similarity index 100% rename from src/kmer_iteration/EveryCanonicalKmer.jl rename to src/old_iter/EveryCanonicalKmer.jl diff --git a/src/kmer_iteration/EveryKmer.jl b/src/old_iter/EveryKmer.jl similarity index 100% rename from src/kmer_iteration/EveryKmer.jl rename to src/old_iter/EveryKmer.jl diff --git a/src/kmer_iteration/SpacedCanonicalKmers.jl b/src/old_iter/SpacedCanonicalKmers.jl similarity index 100% rename from src/kmer_iteration/SpacedCanonicalKmers.jl rename to src/old_iter/SpacedCanonicalKmers.jl diff --git a/src/kmer_iteration/SpacedKmers.jl b/src/old_iter/SpacedKmers.jl similarity index 100% rename from src/kmer_iteration/SpacedKmers.jl rename to src/old_iter/SpacedKmers.jl 
diff --git a/src/transformations.jl b/src/transformations.jl index 4ad7651..89e8893 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -17,7 +17,7 @@ function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}} typeof(x)(unsafe, data) end -# For this method we do +# For this method we do need to mask unused bits, unlike above function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}) isempty(x) && return x data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) From 1293e69a90268b72f34b8562e0d61691e7a1e807 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 22 Jul 2023 15:48:01 +0200 Subject: [PATCH 08/33] Fixup README.md --- README.md | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index ed51eff..17676b6 100644 --- a/README.md +++ b/README.md @@ -7,22 +7,26 @@ ## Description +Kmers provide the `Kmer <: BioSequence` type which implement the concept of a +[k-mer](https://en.wikipedia.org/wiki/K-mer). -Kmers provides a specialised concrete `BioSequence` subtype, optimised for -representing short immutable sequences called kmers: contiguous sub-strings of k -nucleotides of some reference sequence. +A k-mer is a biological sequence of exactly length `k`. They are used commonly +in bioinformatics because, when k is small and known at compile time, these +sequences can be efficiently represented as integers and stored directly in +CPU registers, allowing for much more efficient computation than arbitrary-length +sequences. -They are used extensively in bioinformatic analyses as an informational unit. -This concept was popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. 
+Conceptually, one can think of the following analogy: +* `BioSequence` is like `AbstractString` and `AbstractVector` +* `LongSequence` is like `String` and `Vector` +* `Kmer` is like `[InlineString](https://github.com/JuliaStrings/InlineStrings.jl)` + and `[SVector](https://github.com/JuliaArrays/StaticArrays.jl)` -Kmers provides the type representing kmers as well as the implementations of -the APIs specified by the -[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package. +Kmers.jl is tightly coupled to the +[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, +and rely on its internals. ## Installation - You can install BioSequences from the julia REPL. Press `]` to enter pkg mode, and enter the following: @@ -33,27 +37,14 @@ add Kmers If you are interested in the cutting edge of the development, please check out the master branch to try new features before release. - -## Testing - -Kmers is tested against Julia `1.X` on Linux, OS X, and Windows. - -[![Unit tests](https://github.com/BioJulia/Kmers.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/Kmers.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) -[![Documentation](https://github.com/BioJulia/Kmers.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/BioKmers.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) -[![](https://codecov.io/gh/BioJulia/Kmers.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Kmers.jl) - - ## Contributing - We appreciate contributions from users including reporting bugs, fixing issues, improving performance and adding new features. Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. - ## Questions? 
- If you have a question about contributing or using BioJulia software, come -on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the +on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). From 82b4767b3fd6f2237af85a86b06861fd1c574cea Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 24 Jul 2023 14:28:18 +0200 Subject: [PATCH 09/33] More stuff --- README.md | 15 +++-- src/Kmers.jl | 87 +++++++++++-------------- src/indexing.jl | 8 +-- src/iterators/EveryKmer.jl | 26 ++------ src/kmer.jl | 113 +++++++++++++++++++++++++++++---- test/biosequences_interface.jl | 51 --------------- test/runtests.jl | 45 ++++++++----- 7 files changed, 187 insertions(+), 158 deletions(-) delete mode 100644 test/biosequences_interface.jl diff --git a/README.md b/README.md index 17676b6..14a219e 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,11 @@ [![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable) [![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) - ## Description Kmers provide the `Kmer <: BioSequence` type which implement the concept of a [k-mer](https://en.wikipedia.org/wiki/K-mer). -A k-mer is a biological sequence of exactly length `k`. They are used commonly +A k-mer is a biological sequence of exactly length `k`. k-mers are used frequently in bioinformatics because, when k is small and known at compile time, these sequences can be efficiently represented as integers and stored directly in CPU registers, allowing for much more efficient computation than arbitrary-length @@ -19,12 +18,20 @@ sequences. 
Conceptually, one can think of the following analogy: * `BioSequence` is like `AbstractString` and `AbstractVector` * `LongSequence` is like `String` and `Vector` -* `Kmer` is like `[InlineString](https://github.com/JuliaStrings/InlineStrings.jl)` - and `[SVector](https://github.com/JuliaArrays/StaticArrays.jl)` +* `Kmer` is like [`InlineString`](https://github.com/JuliaStrings/InlineStrings.jl) + and [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) Kmers.jl is tightly coupled to the [`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, and rely on its internals. +Hence, you should expect strict compat bounds on BioSequences.jl + +## Usage + +### ⚠️ WARNING ⚠️ +`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you make use of the compiler's constant propagation capabilities. + +Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you should use the `LongSequence` from BioSequences.jl instead ## Installation You can install BioSequences from the julia diff --git a/src/Kmers.jl b/src/Kmers.jl index 5c74689..2796b76 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -5,11 +5,40 @@ # # This file is a part of the Kmers.jl, a package in the BioJulia ecosystem. # License is MIT: https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE - module Kmers export - # BioSymbols re-exports. 
+ Kmer, + + DNAKmer, + RNAKmer, + AAKmer, + + DNACodon, + RNACodon, + + ReverseGeneticCode, + reverse_translate, + reverse_translate!, + + @mer_str, + + # Immutable operations + push, + pushfirst, + q_push, + q_pushfirst, + + # Iterators + EveryKmer, + EveryDNAMer, + EveryRNAMer, + EveryAAMer, + + ################## + # Re-exports + ################## + # BioSymbols re-exports NucleicAcid, DNA, RNA, @@ -87,54 +116,17 @@ export DNAAlphabet, RNAAlphabet, translate, + complement, + reverse_complement - ### - ### Mers - ### - - # Type & aliases - Kmer, - DNAKmer, - DNA27mer, - DNA31mer, - DNA63mer, - RNAKmer, - RNA27mer, - RNA31mer, - RNA63mer, - AAKmer, - DNACodon, - RNACodon, - - # Iteration - EveryKmer, - SpacedKmers, - EveryCanonicalKmer, - SpacedCanonicalKmers, - fw_neighbors, - bw_neighbors, - - # Immutable operators - push, - delete, - - # Translation - reverse_translate, - reverse_translate!, - ReverseGeneticCode, - - ### - ### Sequence literals - ### - @mer_str, - @bigmer_str - +# Kmers.jl is tightly coupled to BioSequences and relies on much of its internals. +# Hence, we do not care about carefully importing specific symbols using BioSequences """ Kmers.Unsafe -Trait object used to access unsafe methods of functions. +Internal trait object used to access unsafe methods of functions. `unsafe` is the singleton of `Unsafe`. 
""" struct Unsafe end @@ -148,12 +140,5 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/EveryKmer.jl") -#= -include("kmer_iteration/AbstractKmerIterator.jl") -include("kmer_iteration/EveryKmer.jl") -include("kmer_iteration/SpacedKmers.jl") -include("kmer_iteration/EveryCanonicalKmer.jl") -include("kmer_iteration/SpacedCanonicalKmers.jl") -=# end # module diff --git a/src/indexing.jl b/src/indexing.jl index fcfa948..e961705 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -1,6 +1,6 @@ -@inline function BioSequences.extract_encoded_element(seq::Kmer{A}, i::Integer) where A +@inline function BioSequences.extract_encoded_element(seq::Kmer, i::Integer) T = typeof(seq) - bps = BioSequences.bits_per_symbol(A()) % UInt + bps = BioSequences.bits_per_symbol(seq) % UInt index = div((i + n_unused(T) - 1) % UInt, per_word_capacity(T) % UInt) + 1 offset = mod(((elements_in_head(T) - i) * bps) % UInt, 8 * sizeof(UInt)) mask = UInt(1) << bps - 1 @@ -37,7 +37,7 @@ end end @inline function BioSequences.bitindex(kmer::Kmer, i::Unsigned)::Tuple{UInt, UInt} - bps = BioSequences.bits_per_symbol(Alphabet(kmer)) % UInt + bps = BioSequences.bits_per_symbol(kmer) % UInt bpe = (8 * sizeof(UInt)) % UInt (i, o) = divrem((UInt(i) - UInt(1) + n_unused(typeof(kmer))) * bps, bpe) o = bpe - o - bps @@ -46,7 +46,7 @@ end @inline function setindex(kmer::Kmer, i::Integer, s) @boundscheck checkbounds(kmer, i) - bps = BioSequences.bits_per_symbol(Alphabet(kmer)) + bps = BioSequences.bits_per_symbol(kmer) symbol = convert(eltype(kmer), s) encoding = UInt(BioSequences.encode(Alphabet(kmer), symbol)) (i, o) = BioSequences.bitindex(kmer, i % UInt) diff --git a/src/iterators/EveryKmer.jl b/src/iterators/EveryKmer.jl index 434ccf3..ab6f784 100644 --- a/src/iterators/EveryKmer.jl +++ b/src/iterators/EveryKmer.jl @@ -71,12 +71,8 @@ function Base.iterate(it::EveryKmer{S, A, K}, state::Tuple{Kmer, Integer}) where seq = it.seq (kmer, i) = state i > length(seq) && return nothing - 
bps = BioSequences.bits_per_symbol(A()) encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - # TODO: This shares code with pushlast, might want to refactor that. - (_, new_data) = leftshift_carry(kmer.data, bps, encoding) - (head, tail...) = new_data - new_kmer = typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + new_kmer = q_push_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end @@ -101,9 +97,7 @@ function Base.iterate(it::EveryKmer{S, <:FourBit}, state::Tuple{Kmer, Integer}) (kmer, i) = state i > length(seq) && return nothing encoding = recode(UInt(BioSequences.extract_encoded_element(seq, i))) - (_, new_data) = leftshift_carry(kmer.data, 4, encoding) - (head, tail...) = new_data - new_kmer = typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + new_kmer = q_push_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end @@ -113,7 +107,6 @@ function Base.iterate( it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) ) where {A <: TwoBit, S <: BioSequence{<:FourBit}, K} (kmer, remaining, i) = state - data = kmer.data seq = it.seq while !iszero(remaining) i > length(seq) && return nothing @@ -122,12 +115,9 @@ function Base.iterate( i += 1 # TODO: Is lookup table faster? remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) - (_, new_data) = leftshift_carry(data, 2, trailing_zeros(encoding) % UInt) - (head, tail...) = new_data - data = (head & get_mask(typeof(kmer)), tail...) 
+ kmer = q_push_encoding(kmer, trailing_zeros(encoding) % UInt) end - new_kmer = typeof(kmer)(unsafe, data) - return (new_kmer, (new_kmer, 1, i)) + return (kmer, (kmer, 1, i)) end const BYTE_LUT = let @@ -149,7 +139,6 @@ function Base.iterate( it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) ) where {A <: TwoBit, S <: AbstractVector{UInt8}, K} (kmer, remaining, i) = state - data = kmer.data seq = it.seq Base.require_one_based_indexing(seq) while !iszero(remaining) @@ -159,10 +148,7 @@ function Base.iterate( encoding = @inbounds BYTE_LUT[byte + 0x01] encoding == 0xff && throw_bad_byte_error(byte) remaining = ifelse(encoding == 0xf0, K, remaining - 1) - (_, new_data) = leftshift_carry(data, 2, encoding % UInt) - (head, tail...) = new_data - data = (head & get_mask(typeof(kmer)), tail...) + kmer = q_push_encoding(kmer, encoding % UInt) end - new_kmer = typeof(kmer)(unsafe, data) - return (new_kmer, (new_kmer, 1, i)) + return (kmer, (kmer, 1, i)) end \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index 4233124..6c5f900 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -94,7 +94,7 @@ end @inline ksize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = K @inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K -@inline bits_unused(T::Type{<:Kmer{A}}) where A = n_unused(T) * BioSequences.bits_per_symbol(A()) +@inline bits_unused(T::Type{<:Kmer})= n_unused(T) * BioSequences.bits_per_symbol(T) @inline BioSequences.Alphabet(::Kmer{A}) where A = A() @@ -175,6 +175,17 @@ end T(unsafe, data) end +# With LongSequence of the same alphabet, entire coding elements can be copied +# directly. 
+# TODO: Test that LongSequence and LongSubSeq encoded_data_eltype is UInt +@inline function construct_unchecked(T::Type{<:Kmer{A}}, s::LongSequence{A}, data_eltype::Type{UInt}) where {A <: Alphabet} + check_kmer(T) + Bps = BioSequences.BitsPerSymbol(A()) + data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), Bps), Val{nsize(T)}()) + (_, data) = rightshift_carry(data, bits_unused(T), zero(UInt)) + T(unsafe, data) +end + # BioSequence with another element type fall back to the generic length constructor @inline function construct_unchecked(T::Type{<:Kmer}, s::BioSequence, data_eltype::Type) construct_generic_unchecked(Base.HasLength(), T, s) @@ -236,7 +247,7 @@ function Kmer{A, K}(s::Union{String, SubString{String}}) where {A, K} construct_generic_unchecked(Base.HasLength(), derive_type(Kmer{A, K}), s) end -# TODO: Constructor from LongSequence and LongSubSeq +# TODO: Constructor from LongSubSeq # where whole coding elements can be copied directly over # without extracting individual elements @@ -305,21 +316,101 @@ Base.isequal(x::BioSequence, y::Kmer) = false # See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) -function push(kmer::Kmer{A}, s::BioSequences.BioSymbol) where A +function push(kmer::Kmer, s) + bps = BioSequences.bits_per_symbol(kmer) + newT = derive_type(Kmer{A, length(kmer)+1}) + # If no free space in data, add new tuple + new_data = if n_unused(typeof(kmer)) < bps + (zero(UInt), kmer.data...) + else + kmer.data + end + # leftshift_carry the new encoding in. + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + (_, new_data) = leftshift_carry(new_data, bps, encoding) + newT(unsafe, new_data) +end + +""" + q_push(kmer::kmer, symbol)::typeof(kmer) + +Push `symbol` onto the end of `kmer`, and pop the first symbol in `kmer`. 
+ +# Examples +```jldoctest +julia> q_push(mer"TACC"d, DNA_A) +DNA 4-mer +ACCA + +julia> q_push(mer"WKYMLPIIRS"aa, AA_F) +AminoAcid 10-mer +KYMLPIIRSF +``` +""" +function q_push(kmer::Kmer{A}, s) where A + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + q_push_encoding(kmer, encoding) +end + +@inline function q_push_encoding(kmer::Kmer, encoding::UInt) + bps = BioSequences.bits_per_symbol(kmer) + (_, new_data) = leftshift_carry(kmer.data, bps, encoding) + (head, tail...) = new_data + typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) +end + +function pushfirst(kmer::Kmer{A}, s) where A bps = BioSequences.bits_per_symbol(A()) + newT = derive_type(Kmer{A, length(kmer)+1}) + # If no free space in data, add new tuple + new_data = if n_unused(typeof(kmer)) < bps + (zero(UInt), kmer.data...) + else + kmer.data + end + (head, tail...) = new_data encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) - (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) + head |= left_shift(encoding, (elements_in_head(newT) - 1) * bps) + newT(unsafe, (head, tail...)) +end + +""" + q_pushfirst(kmer::kmer, symbol)::typeof(kmer) + +Push `symbol` onto the start of `kmer`, and pop the last symbol in `kmer`. + +# Examples +```jldoctest +julia> q_pushfirst(mer"TACC"d, DNA_A) +DNA 4-mer +ATAC + +julia> q_pushfirst(mer"WKYMLPIIRS"aa, AA_F) +AminoAcid 10-mer +FWKYMLPIIR +``` +""" +function q_pushfirst(kmer::Kmer{A}, s) where A + bps = BioSequences.bits_per_symbol(A()) + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + (_, new_data) = rightshift_carry(kmer.data, bps, encoding) (head, tail...) 
= new_data - head = head | left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) + head |= left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) typeof(kmer)(unsafe, (head, tail...)) end -function pushlast(kmer::Kmer{A}, s::BioSequences.BioSymbol) where A +function pop(kmer::Kmer{A}) where A + isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) bps = BioSequences.bits_per_symbol(A()) - encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) - (_, new_data) = leftshift_carry(kmer.data, bps, encoding) - (head, tail...) = new_data - typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + newT = derive_type(Kmer{A, length(kmer)-1}) + (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) + new_data = if elements_in_head(typeof(kmer)) == 1 + (head, tail...) = new_data + tail + else + new_data + end + newT(unsafe, new_data) end ################################################# @@ -329,5 +420,5 @@ end # Get a mask 0x0001111 ... masking away the unused bits of the head element # in the UInt tuple @inline function get_mask(T::Type{<:Kmer}) - UInt(1) << bits_unused(T) - 1 + UInt(1) << (8*sizeof(UInt) - bits_unused(T)) - 1 end \ No newline at end of file diff --git a/test/biosequences_interface.jl b/test/biosequences_interface.jl deleted file mode 100644 index a52680b..0000000 --- a/test/biosequences_interface.jl +++ /dev/null @@ -1,51 +0,0 @@ -@testset "BioSequences Interface" begin - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{DNAAlphabet{2}, 31}), - rand(ACGT, 31), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{DNAAlphabet{4}, 31}), - rand(ACGT, 31), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{RNAAlphabet{2}, 31}), - rand(ACGU, 31), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{RNAAlphabet{4}, 31}), - rand(ACGU, 31), - false, - ) - - @test BioSequences.has_interface( - 
BioSequence, - Kmers.kmertype(Kmer{DNAAlphabet{2}, 200}), - rand(ACGT, 200), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{DNAAlphabet{4}, 200}), - rand(ACGT, 200), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{RNAAlphabet{2}, 200}), - rand(ACGU, 200), - false, - ) - @test BioSequences.has_interface( - BioSequence, - Kmers.kmertype(Kmer{RNAAlphabet{4}, 200}), - rand(ACGU, 200), - false, - ) -end diff --git a/test/runtests.jl b/test/runtests.jl index 0dc2c86..60ef01d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,25 +4,36 @@ using Kmers using BioSequences using Test -const GROUP = get(ENV, "GROUP", "All") - include("utils.jl") -if GROUP == "BioSequences" || GROUP == "All" - include("biosequences_interface.jl") - include("construction_and_conversion.jl") - include("comparisons.jl") - include("length.jl") - include("access.jl") - include("random.jl") - include("find.jl") - include("print.jl") - include("transformations.jl") - include("mismatches.jl") - include("debruijn_neighbors.jl") - include("iteration.jl") - include("translation.jl") - #include("shuffle.jl") +@testset "BioSequences Interface" begin + for A in [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}, AminoAcidAlphabet] + for K in (1, 9, 116) + @test BioSequences.has_interface( + BioSequence, + Kmers.derive_type(Kmer{A, K}), + rand(collect(A()), K), + false, + ) + end + end +end + +@testset "Construction" begin end +# include("construction_and_conversion.jl") +# include("comparisons.jl") +# include("length.jl") +# include("access.jl") +# include("random.jl") +# include("find.jl") +# include("print.jl") +# include("transformations.jl") +# include("mismatches.jl") +# include("debruijn_neighbors.jl") +# include("iteration.jl") +# include("translation.jl") +#include("shuffle.jl") + end # module From 603c5918c165a1cf9443aba92d5f2eef1ea31a36 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 25 Sep 
2023 20:09:34 +0200 Subject: [PATCH 10/33] Fixup README --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 14a219e..37f4b6a 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ sequences can be efficiently represented as integers and stored directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. -Conceptually, one can think of the following analogy: +Conceptually, one may use the following analogy: * `BioSequence` is like `AbstractString` and `AbstractVector` * `LongSequence` is like `String` and `Vector` * `Kmer` is like [`InlineString`](https://github.com/JuliaStrings/InlineStrings.jl) @@ -24,14 +24,14 @@ Conceptually, one can think of the following analogy: Kmers.jl is tightly coupled to the [`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, and rely on its internals. -Hence, you should expect strict compat bounds on BioSequences.jl +Hence, you should expect strict compat bounds on BioSequences.jl. ## Usage ### ⚠️ WARNING ⚠️ -`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you make use of the compiler's constant propagation capabilities. +`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such as way that the compiler can use constant folding. -Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you should use the `LongSequence` from BioSequences.jl instead +Kmers.jl is intended for high-performance computing. 
If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead ## Installation You can install BioSequences from the julia @@ -41,7 +41,7 @@ REPL. Press `]` to enter pkg mode, and enter the following: add Kmers ``` -If you are interested in the cutting edge of the development, please check out +If you are interested in the cutting edge of development, please check out the master branch to try new features before release. ## Contributing From a3ecda48b4c32dbfa8d81d8e27a3e4a307dbc2f7 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 25 Sep 2023 20:46:29 +0200 Subject: [PATCH 11/33] Begin EveryCanonicalKmer --- src/Kmers.jl | 5 +- src/indexing.jl | 4 +- src/iterators/EveryCanonicalKmer.jl | 87 +++++++++++++++++ src/iterators/EveryKmer.jl | 145 +++++++++++++++++----------- src/iterators/common.jl | 19 +++- src/kmer.jl | 32 +++--- src/transformations.jl | 6 +- 7 files changed, 222 insertions(+), 76 deletions(-) create mode 100644 src/iterators/EveryCanonicalKmer.jl diff --git a/src/Kmers.jl b/src/Kmers.jl index 2796b76..402b677 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -26,8 +26,8 @@ export # Immutable operations push, pushfirst, - q_push, - q_pushfirst, + shift, + shift_first, # Iterators EveryKmer, @@ -140,5 +140,6 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/EveryKmer.jl") +include("iterators/EveryCanonicalKmer.jl") end # module diff --git a/src/indexing.jl b/src/indexing.jl index e961705..2017985 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -1,6 +1,6 @@ @inline function BioSequences.extract_encoded_element(seq::Kmer, i::Integer) T = typeof(seq) - bps = BioSequences.bits_per_symbol(seq) % UInt + bps = BioSequences.bits_per_symbol(Alphabet(seq)) % UInt index = div((i + n_unused(T) - 1) % UInt, per_word_capacity(T) % UInt) + 1 offset = mod(((elements_in_head(T) - i) * bps) % UInt, 8 * sizeof(UInt)) mask = UInt(1) << bps - 1 @@ 
-55,4 +55,4 @@ end element &= ~mask element |= left_shift(encoding, o) typeof(kmer)(unsafe, @inbounds Base.setindex(kmer.data, element, i)) -end \ No newline at end of file +end diff --git a/src/iterators/EveryCanonicalKmer.jl b/src/iterators/EveryCanonicalKmer.jl new file mode 100644 index 0000000..7ddf297 --- /dev/null +++ b/src/iterators/EveryCanonicalKmer.jl @@ -0,0 +1,87 @@ +struct EveryCanonicalKmer{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} + it::EveryKmer{A, K, S} +end + +const SameEveryCanonicalKmer{A, K, S} = EveryCanonicalKmer{S, A, K} where {A, S <: BioSequence{A}} + +function EveryCanonicalKmer{K}(s) where K + S = typeof(s) + A = typeof(Alphabet(S)) + it = EveryKmer{S, A, K}(s) + EveryCanonicalKmer{S, A, K}(it) +end + +function EveryCanonicalKmer{A, K}(s::S) where {S <: BioSequence, A <: Alphabet, K} + EveryCanonicalKmer{S, A, K}(EveryKmer{A, K}(s)) +end + +function EveryCanonicalKmer{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} + s2 = codeunits(s) + EveryCanonicalKmer{typeof(s2), A, K}(s2) +end + +Base.IteratorSize(::Type{<:SameEveryCanonicalKmer}) = Base.HasLength() +Base.IteratorSize(::Type{<:EveryCanonicalKmer{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() +Base.length(it::SameEveryCanonicalKmer) = length(it.it) + +# Generic iterator for the first element: I think we can do no better than to reverse-complement +# the entire kmer. However, for the following iterations, it's faster to add a single basepair to +# the RC'd kmer than to RC it from scratch, hence we need specialized methods for efficient RC'ing +# of individual bases. 
+function Base.iterate(it::EveryCanonicalKmer{S, A, K}) where {S, A, K} + itval = iterate(it.it) + itval === nothing && return nothing + fw = first(itval) + rv = reverse_complement(fw) + (min(fw, rv), (fw, rv, K+1)) +end + +# Generic fallback +function Base.iterate( + it::EveryCanonicalKmer{S, A, K}, + state::Tuple{Kmer, Kmer, Integer} +) where {S, A, K} + seq = it.it.seq + (fw, rv, i) = state + i > length(seq) && return nothing + symbol = @inbounds seq[i] + encoding = UInt(BioSequences.encode(A, symbol)) + rc_encoding = UInt(BioSequences.encode(A, complement(symbol))) + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rv), (fw, rv, i+1)) +end + +# Special method for 2bit -> 2bit +function Base.iterate( + it::EveryCanonicalKmer{S, A, K}, + state::Tuple{Kmer, Kmer, Integer} +) where {K, A <: TwoBit, S <: BioSequence{A}} + seq = it.it.seq + (fw, rv, i) = state + i > length(seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + rc_encoding = encoding ⊻ 0x3 + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rv), (fw, rv, i+1)) +end + +# Special method for 2bit -> 4bit +function Base.iterate( + it::EveryCanonicalKmer{S, A, K}, + state::Tuple{Kmer, Kmer, Integer} +) where {K, A <: FourBit, S <: BioSequence{A}} + seq = it.it.seq + (fw, rv, i) = state + i > length(seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + # Reverse-complementing like this is surprisingly inefficient. 
+ # We may want to consider either using a 16-element LUT, or + # else simply changing the algorithm such that the whole kmer + # is reverse-complemented at every iteration + rc_encoding = reinterpret(UInt8, complement(reinterpret(DNA, encoding % UInt8))) % UInt + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rv), (fw, rv, i+1)) +end diff --git a/src/iterators/EveryKmer.jl b/src/iterators/EveryKmer.jl index ab6f784..4b65e36 100644 --- a/src/iterators/EveryKmer.jl +++ b/src/iterators/EveryKmer.jl @@ -1,11 +1,13 @@ +# TODO: Lots of code sharing in this file... can we refactor to be more clever? + """ - EveryKmer{S, A <: Alphabet, K} + EveryKmer{A <: Alphabet, K, S} Iterator of every forward kmer. `S` signifies the type of the underlying sequence, and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. -Can be constructed more conventiently with the constructors `EveryDNAMer{S, K}(s)` -and `EveryDNAMer{K}(s)`, and similar also for `EveryRNAMer` and `EveryAAMer`. +Can be constructed more conventiently with the constructors `EveryDNAMer{K}(s)` +and similar also for `EveryRNAMer` and `EveryAAMer`. 
If `A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}` and `Alphabet(S) isa Union{DNAAlphabet{4}, RNAAlphabet{4}}`, the iterator skips all @@ -22,39 +24,92 @@ julia> length(collect(EveryRNAMer{3}(rna"UGDCUGAVC"))) 2 ``` """ -struct EveryKmer{S, A <: Alphabet, K} <: AbstractKmerIterator{A, K} +struct EveryKmer{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} seq::S + + function EveryKmer{A, K, S}(seq::S) where {A, K, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + new{A, K, S}(seq) + end end # Constructors -EveryKmer{A, K}(s) where {A <: Alphabet, K} = EveryKmer{typeof(s), A, K} -const EveryDNAMer{S, K} = EveryKmer{S, DNAAlphabet{2}, K} -const EveryRNAMer{S, K} = EveryKmer{S, RNAAlphabet{2}, K} -const EveryAAMer{S, K} = EveryKmer{S, AminoAcidAlphabet, K} +EveryKmer{A, K}(s) where {A <: Alphabet, K} = EveryKmer{A, K, typeof(s)} +const EveryDNAMer{K, S} = EveryKmer{DNAAlphabet{2}, K, S} +const EveryRNAMer{K, S} = EveryKmer{RNAAlphabet{2}, K, S} +const EveryAAMer{K, S} = EveryKmer{AminoAcidAlphabet, K, S} -EveryDNAMer{K}(s) where K = EveryDNAMer{typeof(s), K}(s) -EveryRNAMer{K}(s) where K = EveryRNAMer{typeof(s), K}(s) -EveryAAMer{K}(s) where K = EveryAAMer{typeof(s), K}(s) +EveryDNAMer{K}(s) where K = EveryDNAMer{K, typeof(s), }(s) +EveryRNAMer{K}(s) where K = EveryRNAMer{K, typeof(s)}(s) +EveryAAMer{K}(s) where K = EveryAAMer{K, typeof(s)}(s) -function EveryKmer{S, A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} +function EveryKmer{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} s2 = codeunits(s) - EveryKmer{typeof(s2), A, K}(s2) + EveryKmer{A, K, typeof(s2)}(s2) end -const SameEveryKmer{S, A, K} = EveryKmer{S, A} where {A, S <: BioSequence{A}} -const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} -const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} - # Known length if every symbol of the sequence can be represented in the kmer -Base.IteratorSize(::Type{<:SameEveryKmer}) = 
Base.HasLength() -Base.IteratorSize(::Type{<:EveryKmer{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() +Base.IteratorSize(::Type{<:EveryKmer{A, K, <:BioSequence{A}}}) where {A <: Alphabet, K} = Base.HasLength() +Base.IteratorSize(::Type{<:EveryKmer{<:FourBit, K, <:BioSequence{<:TwoBit}}}) where K = Base.HasLength() -function Base.length(it::SameEveryKmer{S, A, K}) where {S, A, K} +function Base.length(it::EveryKmer{A, K, <:BioSequence{A}}) where {A <: Alphabet, K} length(it.seq) - K + 1 end -# These methods can carry the encoding directly over -function Base.iterate(it::EveryKmer{S, A, K}) where {A, K, S <: BioSequence{A}} +# Generic fallback +function Base.iterate(it::EveryKmer{A, K, S}) where {A <: Alphabet, K, S} + seq = it.seq + length(seq) < K && return nothing + data = zero_tuple(eltype(it)) + bps = BioSequences.bits_per_symbol(A()) + @inbounds for i in 1:K + symbol = seq[i] + encoding = UInt(BioSequences.encode(A(), convert(eltype(A), symbol))) + (_, data) = leftshift_carry(data, bps, encoding) + end + kmer = eltype(it)(unsafe, data) + (kmer, (kmer, K+1)) +end + +function Base.iterate(it::EveryKmer, state::Tuple{Kmer, Integer}) + seq = it.seq + (kmer, i) = state + i > length(seq) && return nothing + symbol = @inbounds seq[i] + new_kmer = shift(kmer, convert(eltype(A), symbol)) + (new_kmer, (new_kmer, i+1)) +end + +# These methods can carry the encoding directly over. We call into the internal method +# `iterate_copy`, because specifying the precise type constrains (either the same alphabet +# in the sequence and the iterator, OR both have either TwoBit or FourBit) +# is quite hard. 
+function Base.iterate(it::EveryKmer{A, K, <:BioSequence{A}, }) where {A <: Alphabet, K} + iterate_copy(it) +end + +function Base.iterate(it::EveryKmer{<:TwoBit, K, <:BioSequence{<:TwoBit}}) where K + iterate_copy(it) +end + +function Base.iterate(it::EveryKmer{<:FourBit, K, <:BioSequence{<:FourBit}}) where K + iterate_copy(it) +end + +function Base.iterate(it::EveryKmer{A, K, <:BioSequence{A}}, state::Tuple{Kmer, Integer}) where {A <: Alphabet, K} + iterate_copy(it, state) +end + +function Base.iterate(it::EveryKmer{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Integer}) where K + iterate_copy(it, state) +end + +function Base.iterate(it::EveryKmer{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Integer}) where K + iterate_copy(it, state) +end + +@inline function iterate_copy(it::EveryKmer{A, K, S}) where {A, K, S} seq = it.seq length(seq) < K && return nothing data = zero_tuple(eltype(it)) @@ -67,45 +122,42 @@ function Base.iterate(it::EveryKmer{S, A, K}) where {A, K, S <: BioSequence{A}} (kmer, (kmer, K+1)) end -function Base.iterate(it::EveryKmer{S, A, K}, state::Tuple{Kmer, Integer}) where {A, K, S <: BioSequence{A}} +@inline function iterate_copy(it::EveryKmer, state::Tuple{Kmer, Integer}) seq = it.seq (kmer, i) = state i > length(seq) && return nothing encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - new_kmer = q_push_encoding(kmer, encoding) + new_kmer = shift_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end # These methods can use special 2 -> 4 bit recoding -@inline recode(encoding::UInt) = left_shift(UInt(1), encoding) - -function Base.iterate(it::EveryKmer{S, <:FourBit, K}) where {S <: BioSequence{<:TwoBit}, K} +function Base.iterate(it::EveryKmer{<:FourBit, K, S}) where {S <: BioSequence{<:TwoBit}, K} seq = it.seq length(seq) < K && return nothing data = zero_tuple(eltype(it)) for i in 1:K - encoding = recode(UInt(BioSequences.extract_encoded_element(seq, i))) + encoding = left_shift(UInt(1), 
UInt(BioSequences.extract_encoded_element(seq, i))) (_, data) = leftshift_carry(data, 4, encoding) end kmer = eltype(it)(unsafe, data) (kmer, (kmer, K+1)) end -# TODO: Lots of code sharing in this file... can we refactor to be more clever? -function Base.iterate(it::EveryKmer{S, <:FourBit}, state::Tuple{Kmer, Integer}) where {S <: BioSequence{<:TwoBit}} +function Base.iterate(it::EveryKmer{<:FourBit, K, S}, state::Tuple{Kmer, Integer}) where {K, S <: BioSequence{<:TwoBit}} seq = it.seq (kmer, i) = state i > length(seq) && return nothing - encoding = recode(UInt(BioSequences.extract_encoded_element(seq, i))) - new_kmer = q_push_encoding(kmer, encoding) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) + new_kmer = shift_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end # This is special because, by convention, we skip every ambiguous kmer # instead of erroring. function Base.iterate( - it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) -) where {A <: TwoBit, S <: BioSequence{<:FourBit}, K} + it::EveryKmer{A, K, S}, state=(zero_kmer(Kmer{A, K}), K, 1) +) where {A <: TwoBit, K, S <: BioSequence{<:FourBit}} (kmer, remaining, i) = state seq = it.seq while !iszero(remaining) @@ -115,29 +167,14 @@ function Base.iterate( i += 1 # TODO: Is lookup table faster? 
remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) - kmer = q_push_encoding(kmer, trailing_zeros(encoding) % UInt) + kmer = shift_encoding(kmer, trailing_zeros(encoding) % UInt) end return (kmer, (kmer, 1, i)) end -const BYTE_LUT = let - v = fill(0xff, 256) - for (i, s) in [(0, "Aa"), (1, "cC"), (2, "gG"), (3, "TtUu")], c in s - v[UInt8(c) + 1] = i - end - for c in "-MRSVWYHKDBN" - v[UInt8(c) + 1] = 0xf0 - v[UInt8(lowercase(c)) + 1] = 0xf0 - end - Tuple(v) -end - -# TODO: Change to lazy_str when new Julia LTS drops after 1.6 -@noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") - function Base.iterate( - it::EveryKmer{S, A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) -) where {A <: TwoBit, S <: AbstractVector{UInt8}, K} + it::EveryKmer{A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) +) where {A <: TwoBit, K} (kmer, remaining, i) = state seq = it.seq Base.require_one_based_indexing(seq) @@ -148,7 +185,7 @@ function Base.iterate( encoding = @inbounds BYTE_LUT[byte + 0x01] encoding == 0xff && throw_bad_byte_error(byte) remaining = ifelse(encoding == 0xf0, K, remaining - 1) - kmer = q_push_encoding(kmer, encoding % UInt) + kmer = shift_encoding(kmer, encoding % UInt) end return (kmer, (kmer, 1, i)) -end \ No newline at end of file +end diff --git a/src/iterators/common.jl b/src/iterators/common.jl index 9d8bbae..c0198df 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -4,4 +4,21 @@ function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} Kmer{A, K, n_coding_elements(Kmer{A, K})} end -Base.IteratorSize(::Type{<:AbstractKmerIterator}) = Base.SizeUnknown() \ No newline at end of file +Base.IteratorSize(::Type{<:AbstractKmerIterator}) = Base.SizeUnknown() + +const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} +const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} + +@noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") + +const BYTE_LUT = 
let + v = fill(0xff, 256) + for (i, s) in [(0, "Aa"), (1, "cC"), (2, "gG"), (3, "TtUu")], c in s + v[UInt8(c) + 1] = i + end + for c in "-MRSVWYHKDBN" + v[UInt8(c) + 1] = 0xf0 + v[UInt8(lowercase(c)) + 1] = 0xf0 + end + Tuple(v) +end \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index 6c5f900..28a383f 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -94,7 +94,7 @@ end @inline ksize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = K @inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K -@inline bits_unused(T::Type{<:Kmer})= n_unused(T) * BioSequences.bits_per_symbol(T) +@inline bits_unused(T::Type{<:Kmer}) = n_unused(T) * BioSequences.bits_per_symbol(T) @inline BioSequences.Alphabet(::Kmer{A}) where A = A() @@ -332,27 +332,27 @@ function push(kmer::Kmer, s) end """ - q_push(kmer::kmer, symbol)::typeof(kmer) +shift(kmer::kmer, symbol)::typeof(kmer) Push `symbol` onto the end of `kmer`, and pop the first symbol in `kmer`. # Examples ```jldoctest -julia> q_push(mer"TACC"d, DNA_A) +julia> shift(mer"TACC"d, DNA_A) DNA 4-mer ACCA -julia> q_push(mer"WKYMLPIIRS"aa, AA_F) +julia> shift(mer"WKYMLPIIRS"aa, AA_F) AminoAcid 10-mer KYMLPIIRSF ``` """ -function q_push(kmer::Kmer{A}, s) where A +function shift(kmer::Kmer{A}, s) where A encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) - q_push_encoding(kmer, encoding) + shift_encoding(kmer, encoding) end -@inline function q_push_encoding(kmer::Kmer, encoding::UInt) +@inline function shift_encoding(kmer::Kmer, encoding::UInt) bps = BioSequences.bits_per_symbol(kmer) (_, new_data) = leftshift_carry(kmer.data, bps, encoding) (head, tail...) = new_data @@ -375,25 +375,29 @@ function pushfirst(kmer::Kmer{A}, s) where A end """ - q_pushfirst(kmer::kmer, symbol)::typeof(kmer) + shift_first(kmer::kmer, symbol)::typeof(kmer) Push `symbol` onto the start of `kmer`, and pop the last symbol in `kmer`. 
# Examples ```jldoctest -julia> q_pushfirst(mer"TACC"d, DNA_A) +julia> shift_first(mer"TACC"d, DNA_A) DNA 4-mer ATAC -julia> q_pushfirst(mer"WKYMLPIIRS"aa, AA_F) +julia> shift_first(mer"WKYMLPIIRS"aa, AA_F) AminoAcid 10-mer FWKYMLPIIR ``` """ -function q_pushfirst(kmer::Kmer{A}, s) where A - bps = BioSequences.bits_per_symbol(A()) +function shift_first(kmer::Kmer{A}, s) where A encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) - (_, new_data) = rightshift_carry(kmer.data, bps, encoding) + shift_first_encoding(kmer, encoding) +end + +function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where A + bps = BioSequences.bits_per_symbol(A()) + (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) (head, tail...) = new_data head |= left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) typeof(kmer)(unsafe, (head, tail...)) @@ -421,4 +425,4 @@ end # in the UInt tuple @inline function get_mask(T::Type{<:Kmer}) UInt(1) << (8*sizeof(UInt) - bits_unused(T)) - 1 -end \ No newline at end of file +end diff --git a/src/transformations.jl b/src/transformations.jl index 89e8893..d658b12 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -42,7 +42,7 @@ end BioSequences.iscanonical(x::Kmer) = x <= reverse_complement(x) -function translate( +function BioSequences.translate( seq::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}; code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, allow_ambiguous_codons::Bool = true, # noop in this method @@ -70,7 +70,7 @@ function translate( end end -function translate( +function BioSequences.translate( seq::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, allow_ambiguous_codons::Bool = true, # noop in this method @@ -101,4 +101,4 @@ function translate( else return result end -end \ No newline at end of file +end From 6c1535e954a353ba1a185f534c5df55f4b7484b5 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 
26 Sep 2023 13:00:25 +0200 Subject: [PATCH 12/33] Rename: EveryKmer to FwKmers --- src/Kmers.jl | 10 +-- ...ryCanonicalKmer.jl => FwCanonicalKmers.jl} | 34 +++++----- src/iterators/{EveryKmer.jl => FwKmers.jl} | 64 +++++++++---------- 3 files changed, 54 insertions(+), 54 deletions(-) rename src/iterators/{EveryCanonicalKmer.jl => FwCanonicalKmers.jl} (68%) rename src/iterators/{EveryKmer.jl => FwKmers.jl} (63%) diff --git a/src/Kmers.jl b/src/Kmers.jl index 402b677..d74bf21 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -30,10 +30,10 @@ export shift_first, # Iterators - EveryKmer, - EveryDNAMer, - EveryRNAMer, - EveryAAMer, + FwKmers, + FwDNAMers, + FwRNAMers, + FwAAMers, ################## # Re-exports @@ -139,7 +139,7 @@ include("transformations.jl") include("revtrans.jl") include("iterators/common.jl") -include("iterators/EveryKmer.jl") +include("iterators/FwKmers.jl") include("iterators/EveryCanonicalKmer.jl") end # module diff --git a/src/iterators/EveryCanonicalKmer.jl b/src/iterators/FwCanonicalKmers.jl similarity index 68% rename from src/iterators/EveryCanonicalKmer.jl rename to src/iterators/FwCanonicalKmers.jl index 7ddf297..e94bc60 100644 --- a/src/iterators/EveryCanonicalKmer.jl +++ b/src/iterators/FwCanonicalKmers.jl @@ -1,34 +1,34 @@ -struct EveryCanonicalKmer{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} - it::EveryKmer{A, K, S} +struct FwCanonicalKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} + it::FwKmers{A, K, S} end -const SameEveryCanonicalKmer{A, K, S} = EveryCanonicalKmer{S, A, K} where {A, S <: BioSequence{A}} +const SameFwCanonicalKmers{A, K, S} = FwCanonicalKmers{S, A, K} where {A, S <: BioSequence{A}} -function EveryCanonicalKmer{K}(s) where K +function FwCanonicalKmers{K}(s) where K S = typeof(s) A = typeof(Alphabet(S)) - it = EveryKmer{S, A, K}(s) - EveryCanonicalKmer{S, A, K}(it) + it = FwKmers{S, A, K}(s) + FwCanonicalKmers{S, A, K}(it) end -function EveryCanonicalKmer{A, K}(s::S) where {S <: BioSequence, A <: 
Alphabet, K} - EveryCanonicalKmer{S, A, K}(EveryKmer{A, K}(s)) +function FwCanonicalKmers{A, K}(s::S) where {S <: BioSequence, A <: Alphabet, K} + FwCanonicalKmers{S, A, K}(FwKmers{A, K}(s)) end -function EveryCanonicalKmer{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} +function FwCanonicalKmers{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} s2 = codeunits(s) - EveryCanonicalKmer{typeof(s2), A, K}(s2) + FwCanonicalKmers{typeof(s2), A, K}(s2) end -Base.IteratorSize(::Type{<:SameEveryCanonicalKmer}) = Base.HasLength() -Base.IteratorSize(::Type{<:EveryCanonicalKmer{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() -Base.length(it::SameEveryCanonicalKmer) = length(it.it) +Base.IteratorSize(::Type{<:SameFwCanonicalKmers}) = Base.HasLength() +Base.IteratorSize(::Type{<:FwCanonicalKmers{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() +Base.length(it::SameFwCanonicalKmers) = length(it.it) # Generic iterator for the first element: I think we can do no better than to reverse-complement # the entire kmer. However, for the following iterations, it's faster to add a single basepair to # the RC'd kmer than to RC it from scratch, hence we need specialized methods for efficient RC'ing # of individual bases. 
-function Base.iterate(it::EveryCanonicalKmer{S, A, K}) where {S, A, K} +function Base.iterate(it::FwCanonicalKmers{S, A, K}) where {S, A, K} itval = iterate(it.it) itval === nothing && return nothing fw = first(itval) @@ -38,7 +38,7 @@ end # Generic fallback function Base.iterate( - it::EveryCanonicalKmer{S, A, K}, + it::FwCanonicalKmers{S, A, K}, state::Tuple{Kmer, Kmer, Integer} ) where {S, A, K} seq = it.it.seq @@ -54,7 +54,7 @@ end # Special method for 2bit -> 2bit function Base.iterate( - it::EveryCanonicalKmer{S, A, K}, + it::FwCanonicalKmers{S, A, K}, state::Tuple{Kmer, Kmer, Integer} ) where {K, A <: TwoBit, S <: BioSequence{A}} seq = it.it.seq @@ -69,7 +69,7 @@ end # Special method for 2bit -> 4bit function Base.iterate( - it::EveryCanonicalKmer{S, A, K}, + it::FwCanonicalKmers{S, A, K}, state::Tuple{Kmer, Kmer, Integer} ) where {K, A <: FourBit, S <: BioSequence{A}} seq = it.it.seq diff --git a/src/iterators/EveryKmer.jl b/src/iterators/FwKmers.jl similarity index 63% rename from src/iterators/EveryKmer.jl rename to src/iterators/FwKmers.jl index 4b65e36..4ac8594 100644 --- a/src/iterators/EveryKmer.jl +++ b/src/iterators/FwKmers.jl @@ -1,13 +1,13 @@ # TODO: Lots of code sharing in this file... can we refactor to be more clever? """ - EveryKmer{A <: Alphabet, K, S} + FwKmers{A <: Alphabet, K, S} -Iterator of every forward kmer. `S` signifies the type of the underlying sequence, +Iterator of forward kmers. `S` signifies the type of the underlying sequence, and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. -Can be constructed more conventiently with the constructors `EveryDNAMer{K}(s)` -and similar also for `EveryRNAMer` and `EveryAAMer`. +Can be constructed more conventiently with the constructors `FwDNAMers{K}(s)` +and similar also for `FwRNAMers` and `FwAAMers`. 
If `A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}` and `Alphabet(S) isa Union{DNAAlphabet{4}, RNAAlphabet{4}}`, the iterator skips all @@ -24,10 +24,10 @@ julia> length(collect(EveryRNAMer{3}(rna"UGDCUGAVC"))) 2 ``` """ -struct EveryKmer{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} +struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} seq::S - function EveryKmer{A, K, S}(seq::S) where {A, K, S} + function FwKmers{A, K, S}(seq::S) where {A, K, S} K isa Int || error("K must be an Int") K > 0 || error("K must be at least 1") new{A, K, S}(seq) @@ -35,30 +35,30 @@ struct EveryKmer{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} end # Constructors -EveryKmer{A, K}(s) where {A <: Alphabet, K} = EveryKmer{A, K, typeof(s)} -const EveryDNAMer{K, S} = EveryKmer{DNAAlphabet{2}, K, S} -const EveryRNAMer{K, S} = EveryKmer{RNAAlphabet{2}, K, S} -const EveryAAMer{K, S} = EveryKmer{AminoAcidAlphabet, K, S} +FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} +const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} +const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} +const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} -EveryDNAMer{K}(s) where K = EveryDNAMer{K, typeof(s), }(s) -EveryRNAMer{K}(s) where K = EveryRNAMer{K, typeof(s)}(s) -EveryAAMer{K}(s) where K = EveryAAMer{K, typeof(s)}(s) +FwDNAMers{K}(s) where K = FwDNAMers{K, typeof(s), }(s) +FwRNAMers{K}(s) where K = FwRNAMers{K, typeof(s)}(s) +FwAAMers{K}(s) where K = FwAAMers{K, typeof(s)}(s) -function EveryKmer{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} +function FwKmers{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} s2 = codeunits(s) - EveryKmer{A, K, typeof(s2)}(s2) + FwKmers{A, K, typeof(s2)}(s2) end # Known length if every symbol of the sequence can be represented in the kmer -Base.IteratorSize(::Type{<:EveryKmer{A, K, <:BioSequence{A}}}) where {A <: Alphabet, K} = Base.HasLength() 
-Base.IteratorSize(::Type{<:EveryKmer{<:FourBit, K, <:BioSequence{<:TwoBit}}}) where K = Base.HasLength() +Base.IteratorSize(::Type{<:FwKmers{A, K, <:BioSequence{A}}}) where {A <: Alphabet, K} = Base.HasLength() +Base.IteratorSize(::Type{<:FwKmers{<:FourBit, K, <:BioSequence{<:TwoBit}}}) where K = Base.HasLength() -function Base.length(it::EveryKmer{A, K, <:BioSequence{A}}) where {A <: Alphabet, K} +function Base.length(it::FwKmers{A, K, <:BioSequence{A}}) where {A <: Alphabet, K} length(it.seq) - K + 1 end # Generic fallback -function Base.iterate(it::EveryKmer{A, K, S}) where {A <: Alphabet, K, S} +function Base.iterate(it::FwKmers{A, K, S}) where {A <: Alphabet, K, S} seq = it.seq length(seq) < K && return nothing data = zero_tuple(eltype(it)) @@ -72,7 +72,7 @@ function Base.iterate(it::EveryKmer{A, K, S}) where {A <: Alphabet, K, S} (kmer, (kmer, K+1)) end -function Base.iterate(it::EveryKmer, state::Tuple{Kmer, Integer}) +function Base.iterate(it::FwKmers, state::Tuple{Kmer, Integer}) seq = it.seq (kmer, i) = state i > length(seq) && return nothing @@ -85,31 +85,31 @@ end # `iterate_copy`, because specifying the precise type constrains (either the same alphabet # in the sequence and the iterator, OR both have either TwoBit or FourBit) # is quite hard. 
-function Base.iterate(it::EveryKmer{A, K, <:BioSequence{A}, }) where {A <: Alphabet, K} +function Base.iterate(it::FwKmers{A, K, <:BioSequence{A}, }) where {A <: Alphabet, K} iterate_copy(it) end -function Base.iterate(it::EveryKmer{<:TwoBit, K, <:BioSequence{<:TwoBit}}) where K +function Base.iterate(it::FwKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}) where K iterate_copy(it) end -function Base.iterate(it::EveryKmer{<:FourBit, K, <:BioSequence{<:FourBit}}) where K +function Base.iterate(it::FwKmers{<:FourBit, K, <:BioSequence{<:FourBit}}) where K iterate_copy(it) end -function Base.iterate(it::EveryKmer{A, K, <:BioSequence{A}}, state::Tuple{Kmer, Integer}) where {A <: Alphabet, K} +function Base.iterate(it::FwKmers{A, K, <:BioSequence{A}}, state::Tuple{Kmer, Integer}) where {A <: Alphabet, K} iterate_copy(it, state) end -function Base.iterate(it::EveryKmer{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Integer}) where K +function Base.iterate(it::FwKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Integer}) where K iterate_copy(it, state) end -function Base.iterate(it::EveryKmer{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Integer}) where K +function Base.iterate(it::FwKmers{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Integer}) where K iterate_copy(it, state) end -@inline function iterate_copy(it::EveryKmer{A, K, S}) where {A, K, S} +@inline function iterate_copy(it::FwKmers{A, K, S}) where {A, K, S} seq = it.seq length(seq) < K && return nothing data = zero_tuple(eltype(it)) @@ -122,7 +122,7 @@ end (kmer, (kmer, K+1)) end -@inline function iterate_copy(it::EveryKmer, state::Tuple{Kmer, Integer}) +@inline function iterate_copy(it::FwKmers, state::Tuple{Kmer, Integer}) seq = it.seq (kmer, i) = state i > length(seq) && return nothing @@ -132,7 +132,7 @@ end end # These methods can use special 2 -> 4 bit recoding -function Base.iterate(it::EveryKmer{<:FourBit, K, S}) where {S <: BioSequence{<:TwoBit}, K} 
+function Base.iterate(it::FwKmers{<:FourBit, K, S}) where {S <: BioSequence{<:TwoBit}, K} seq = it.seq length(seq) < K && return nothing data = zero_tuple(eltype(it)) @@ -144,7 +144,7 @@ function Base.iterate(it::EveryKmer{<:FourBit, K, S}) where {S <: BioSequence{<: (kmer, (kmer, K+1)) end -function Base.iterate(it::EveryKmer{<:FourBit, K, S}, state::Tuple{Kmer, Integer}) where {K, S <: BioSequence{<:TwoBit}} +function Base.iterate(it::FwKmers{<:FourBit, K, S}, state::Tuple{Kmer, Integer}) where {K, S <: BioSequence{<:TwoBit}} seq = it.seq (kmer, i) = state i > length(seq) && return nothing @@ -156,7 +156,7 @@ end # This is special because, by convention, we skip every ambiguous kmer # instead of erroring. function Base.iterate( - it::EveryKmer{A, K, S}, state=(zero_kmer(Kmer{A, K}), K, 1) + it::FwKmers{A, K, S}, state=(zero_kmer(Kmer{A, K}), K, 1) ) where {A <: TwoBit, K, S <: BioSequence{<:FourBit}} (kmer, remaining, i) = state seq = it.seq @@ -173,7 +173,7 @@ function Base.iterate( end function Base.iterate( - it::EveryKmer{A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) + it::FwKmers{A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) ) where {A <: TwoBit, K} (kmer, remaining, i) = state seq = it.seq From ba774f165bcec7ded41dba0e6b0620d134351e54 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 30 Sep 2023 15:43:42 +0200 Subject: [PATCH 13/33] Start SpacedKmers --- src/Kmers.jl | 2 +- src/iterators/FwCanonicalKmers.jl | 2 +- src/iterators/SpacedKmers.jl | 69 +++++++++++++++++++++++++++++++ src/kmer.jl | 13 +----- 4 files changed, 72 insertions(+), 14 deletions(-) create mode 100644 src/iterators/SpacedKmers.jl diff --git a/src/Kmers.jl b/src/Kmers.jl index d74bf21..8f37b18 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -140,6 +140,6 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/FwKmers.jl") -include("iterators/EveryCanonicalKmer.jl") +include("iterators/FwCanonicalKmers.jl") end # module diff --git 
a/src/iterators/FwCanonicalKmers.jl b/src/iterators/FwCanonicalKmers.jl index e94bc60..54ccfde 100644 --- a/src/iterators/FwCanonicalKmers.jl +++ b/src/iterators/FwCanonicalKmers.jl @@ -4,7 +4,7 @@ end const SameFwCanonicalKmers{A, K, S} = FwCanonicalKmers{S, A, K} where {A, S <: BioSequence{A}} -function FwCanonicalKmers{K}(s) where K +function FwCanonicalKmers{K}(s::BioSequence) where K S = typeof(s) A = typeof(Alphabet(S)) it = FwKmers{S, A, K}(s) diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl new file mode 100644 index 0000000..72c47a2 --- /dev/null +++ b/src/iterators/SpacedKmers.jl @@ -0,0 +1,69 @@ + +""" + SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} + +An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given +longer `BioSequence`, between a `start` and `stop` position. + +!!! note + Typically, the alphabet of the Kmer type matches the alphabet of the input + BioSequence. In these cases, the iterator will have `Base.IteratorSize` of + `Base.HasLength`, and successive kmers produced by the iterator will overlap + by `max(0, K - step)` bases. + + However, in the specific case of iterating over kmers in a DNA or RNA sequence, you + may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but + the input BioSequence has a NucleicAcidAlphabet{4}. + + In this case then the iterator will skip over positions in the BioSequence + with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. + + As a result, the overlap between successive kmers may not consistent, but the + reading frame will be preserved. + In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. 
+""" +struct SpacedKmers{A <: Alphabet, K, St, S} <: AbstractKmerIterator{A, K} + seq::S + + function SpacedKmer{A, K, St, S}(seq) where {A, K, St, S} + K isa Int || K > 0 || error("K must be an Int > 0") + St isa Int || St > 0 || error("St must be an Int > 0") + new{A, K, St, S}(seq) + end +end + +# Constructors + +# Iterators +function Base.iterate( + it::SpacedKmers{A, K, St, <:BioSequence{A}}, + state=(1, zero_tuple(eltype(it)), K) +) where {A, K, St} + iterate_copy(it, state) +end + +# Just copy the encoding straight over +@inline function iterate_copy(it::SpacedKmers{A, K, St}, state::Tuple{Int, <:Tuple{Vararg{UInt}}, Int}) where {A, K, St} + (index, data, remaining) = state + original_index = index + seq = it.seq + len = length(seq) + bps = BioSequences.bits_per_symbol(A()) + # TODO: Don't double check remaining and index + while !iszero(remaining) && index ≤ len + encoding = UInt(BioSequences.extract_encoded_element(seq, index)) + (_, data) = leftshift_carry(data, bps, encoding) + end + return if iszero(remaining) + kmer = eltype(it)(unsafe, data) + remaining = min(K, St) + state = ( + original_index + St, + # data, left shift carry clear elements + remaining + ) + (kmer, state) + else + nothing + end +end \ No newline at end of file diff --git a/src/kmer.jl b/src/kmer.jl index 28a383f..60a87ec 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -91,7 +91,7 @@ end # Compile-time functions computed on Kmer types ################################################ -@inline ksize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = K +@inline ksize(::Type{<:Kmer{A, K}}) where {A, K} = K @inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K @inline bits_unused(T::Type{<:Kmer}) = n_unused(T) * BioSequences.bits_per_symbol(T) @@ -305,15 +305,8 @@ Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) Base.:(==)(x::Kmer{A}, y::Kmer{A}) where A = x.data == y.data Base.isless(x::T, y::T) 
where {T <: Kmer} = isless(x.data, y.data) -# TODO: We need to figure out what to do with hashing first. -# Per the contract of isequal, isequal(a, b) == (hash(a) == hash(b)). -# Further, it's imperative that hashing kmers is absolutely optimal. -# So, what to do? Base.isequal(x::Kmer, y::BioSequence) = false Base.isequal(x::BioSequence, y::Kmer) = false - -# TODO: Ensure this is the right way to go. -# See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) function push(kmer::Kmer, s) @@ -417,10 +410,6 @@ function pop(kmer::Kmer{A}) where A newT(unsafe, new_data) end -################################################# -# Various bit-twiddling useful functions on kmers -################################################# - # Get a mask 0x0001111 ... masking away the unused bits of the head element # in the UInt tuple @inline function get_mask(T::Type{<:Kmer}) From 6584e2e5a2fae4a2c73c1d60a9a3b71742854ce2 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 2 Oct 2023 10:00:06 +0200 Subject: [PATCH 14/33] Some refactoring --- src/Kmers.jl | 1 + src/iterators/FwCanonicalKmers.jl | 4 +- src/iterators/FwKmers.jl | 11 ++-- src/iterators/SpacedKmers.jl | 98 ++++++++++++++++++++++--------- src/iterators/common.jl | 30 +++++++++- src/transformations.jl | 4 +- 6 files changed, 109 insertions(+), 39 deletions(-) diff --git a/src/Kmers.jl b/src/Kmers.jl index 8f37b18..6011f0a 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -141,5 +141,6 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/FwKmers.jl") include("iterators/FwCanonicalKmers.jl") +include("iterators/SpacedKmers.jl") end # module diff --git a/src/iterators/FwCanonicalKmers.jl b/src/iterators/FwCanonicalKmers.jl index 54ccfde..80c60ad 100644 --- a/src/iterators/FwCanonicalKmers.jl +++ b/src/iterators/FwCanonicalKmers.jl @@ -4,6 +4,8 @@ end const SameFwCanonicalKmers{A, K, S} = 
FwCanonicalKmers{S, A, K} where {A, S <: BioSequence{A}} +source_type(::Type{FwCanonicalKmers{A, K, S}}) where {A, K, S} = S + function FwCanonicalKmers{K}(s::BioSequence) where K S = typeof(s) A = typeof(Alphabet(S)) @@ -20,8 +22,6 @@ function FwCanonicalKmers{A, K}(s::S) where {S <: Union{String, SubString{String FwCanonicalKmers{typeof(s2), A, K}(s2) end -Base.IteratorSize(::Type{<:SameFwCanonicalKmers}) = Base.HasLength() -Base.IteratorSize(::Type{<:FwCanonicalKmers{<:BioSequence{<:TwoBit}, <:FourBit}}) = Base.HasLength() Base.length(it::SameFwCanonicalKmers) = length(it.it) # Generic iterator for the first element: I think we can do no better than to reverse-complement diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 4ac8594..44bd0cf 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -34,6 +34,8 @@ struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} end end +source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S + # Constructors FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} @@ -49,12 +51,9 @@ function FwKmers{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: FwKmers{A, K, typeof(s2)}(s2) end -# Known length if every symbol of the sequence can be represented in the kmer -Base.IteratorSize(::Type{<:FwKmers{A, K, <:BioSequence{A}}}) where {A <: Alphabet, K} = Base.HasLength() -Base.IteratorSize(::Type{<:FwKmers{<:FourBit, K, <:BioSequence{<:TwoBit}}}) where K = Base.HasLength() - -function Base.length(it::FwKmers{A, K, <:BioSequence{A}}) where {A <: Alphabet, K} - length(it.seq) - K + 1 +function Base.length(it::FwKmers) + Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) + length(it.seq) - ksize(eltype(it)) + 1 end # Generic fallback diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl index 72c47a2..ffa96ef 100644 --- a/src/iterators/SpacedKmers.jl +++ 
b/src/iterators/SpacedKmers.jl @@ -1,3 +1,4 @@ +# TODO: Can the FW kmers simply be an alias for SpacedKmers with step = 1 """ SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} @@ -25,45 +26,88 @@ longer `BioSequence`, between a `start` and `stop` position. struct SpacedKmers{A <: Alphabet, K, St, S} <: AbstractKmerIterator{A, K} seq::S - function SpacedKmer{A, K, St, S}(seq) where {A, K, St, S} - K isa Int || K > 0 || error("K must be an Int > 0") - St isa Int || St > 0 || error("St must be an Int > 0") + function SpacedKmers{A, K, St, S}(seq) where {A, K, St, S} + (K isa Int && K > 0) || error("K must be an Int > 0") + (St isa Int && St > 0) || error("St must be an Int > 0") new{A, K, St, S}(seq) end end +source_type(::Type{SpacedKmers{A, K, St, S}}) where {A, K, St, S} = S + +function Base.length(it::SpacedKmers{A, K, St}) where {A, K, St} + Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) + available_starting_positions = length(it.seq) - ksize(eltype(it)) + 1 + div(available_starting_positions, St) +end + # Constructors +SpacedKmers{A, K, St}(seq) where {A, K, St} = SpacedKmers{A, K, St, typeof(seq)}(seq) # Iterators -function Base.iterate( - it::SpacedKmers{A, K, St, <:BioSequence{A}}, - state=(1, zero_tuple(eltype(it)), K) -) where {A, K, St} - iterate_copy(it, state) +function Base.iterate(it::SpacedKmers{A, K, St, <:BioSequence{A}}) where {A, K, St} + if St ≥ K + iterate_copy_nomask(it, 1) + else + x = iterate_copy_nomask(it, 1) + x === nothing && return nothing + (kmer, _) = x + return (kmer, (K + 1, kmer.data)) + end end -# Just copy the encoding straight over -@inline function iterate_copy(it::SpacedKmers{A, K, St}, state::Tuple{Int, <:Tuple{Vararg{UInt}}, Int}) where {A, K, St} - (index, data, remaining) = state - original_index = index +function Base.iterate(it::SpacedKmers{A, K, St, <:BioSequence{A}}, state) where {A, K, St} + if St ≥ K + iterate_copy_nomask(it, state) + else 
+ iterate_copy_mask(it, state) + end +end + +# Called when St ≥ K, and the encoding in seq matches that of the kmer. +# We can build the kmer from scratch at every iteration, simplifying the code +@inline function iterate_copy_nomask(it::SpacedKmers{A, K, St}, state::Int) where {A, K, St} seq = it.seq len = length(seq) bps = BioSequences.bits_per_symbol(A()) - # TODO: Don't double check remaining and index - while !iszero(remaining) && index ≤ len - encoding = UInt(BioSequences.extract_encoded_element(seq, index)) + remaining = K + data = zero_tuple(eltype(it)) + while true + state > len && return nothing + encoding = UInt(BioSequences.extract_encoded_element(seq, state)) (_, data) = leftshift_carry(data, bps, encoding) + state += 1 + remaining -= 1 + iszero(remaining) && return (eltype(it)(unsafe, data), state + max(0, St-K)) end - return if iszero(remaining) - kmer = eltype(it)(unsafe, data) - remaining = min(K, St) - state = ( - original_index + St, - # data, left shift carry clear elements - remaining - ) - (kmer, state) - else - nothing +end + +# Called when St < K, and the encoding in seq matches that of the kmer. +# We can copy the encoding right over, and we need to preserve some data in the kmer +# between iterations +@inline function iterate_copy_mask(it::SpacedKmers{A, K, St}, state::Tuple{Int, Tuple{Vararg{UInt}}}) where {A, K, St} + seq = it.seq + len = length(seq) + bps = BioSequences.bits_per_symbol(A()) + remaining = St + (index, data) = state + while true + index > len && return nothing + encoding = UInt(BioSequences.extract_encoded_element(seq, index)) + (_, data) = leftshift_carry(data, bps, encoding) + index += 1 + remaining -= 1 + if iszero(remaining) + # Mask out unused bits before we return the kmer. + (head, rest...) = data + kmer = eltype(it)(unsafe, (head & get_mask(eltype(it)), rest...)) + return (kmer, (index, data)) + end end -end \ No newline at end of file +end + +# TODO: Methods: +# 4 -> 2 bit +# 2 -> 4 bit? 
+# Byte sequence: 2 bit +# Byte sequence: other alphabets \ No newline at end of file diff --git a/src/iterators/common.jl b/src/iterators/common.jl index c0198df..69f0518 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -4,10 +4,36 @@ function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} Kmer{A, K, n_coding_elements(Kmer{A, K})} end -Base.IteratorSize(::Type{<:AbstractKmerIterator}) = Base.SizeUnknown() - const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} +const ByteSource = Union{String, SubString{String}, AbstractVector{UInt8}} + +""" + source_type(::Type{<:AbstractKmerIterator})::Type + +Get the type of the data source that kmers are extracted from +""" +function source_type end + +# We can't compute the length for +# * Unknown alphabets, or unknown source types +# * A source type that can contain ambiguous nucleotides, +# while the kmer type does not. In this case, it's standard practise to +# skip these symbols. 
+function Base.IteratorSize(::Type{T}) where {T <: AbstractKmerIterator} + kT = eltype(T) + sT = source_type(T) + A = Alphabet(kT) + return if sT <: BioSequence && Alphabet(sT) == A + Base.HasLength() + elseif sT <: BioSequence && Alphabet(sT) isa TwoBit && A isa FourBit + Base.HasLength() + elseif sT <: ByteSource && isa Union{FourBit, AminoAcidAlphabet} + Base.HasLength() + else + Base.SizeUnknown() + end +end @noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") diff --git a/src/transformations.jl b/src/transformations.jl index d658b12..59d2188 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -63,7 +63,7 @@ function BioSequences.translate( (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end result = T(unsafe, data) - if alternative_start && !iszero(ksize(seq)) + if alternative_start && !iszero(ksize(typeof(seq))) return setindex(result, 1, AA_M) else return result @@ -96,7 +96,7 @@ function BioSequences.translate( (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end result = T(unsafe, data) - if alternative_start && !iszero(ksize(seq)) + if alternative_start && !iszero(ksize(typeof(seq))) return setindex(result, 1, AA_M) else return result From 550dd6e8d51fb26b4d34ef6c0d1bff1d91df9b04 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 2 Oct 2023 18:49:31 +0200 Subject: [PATCH 15/33] Refactor to trait objects --- src/iterators/FwKmers.jl | 171 ++++++++++++++++----------------------- src/iterators/common.jl | 157 ++++++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 102 deletions(-) diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 44bd0cf..ae512ca 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -1,5 +1,3 @@ -# TODO: Lots of code sharing in this file... can we refactor to be more clever? 
- """ FwKmers{A <: Alphabet, K, S} @@ -15,12 +13,12 @@ kmers containing symbols not permitted in the 2-bit nucleotide alphabet. # Examples: ```jldoctest -julia> v = collect(EveryDNAMer{3}("AGCGTATA")); +julia> v = collect(FwDNAMers{3}("AGCGTATA")); julia eltype(v), length(v) (Kmer{DNAAlphabet{2}, 3, 1}, 6) -julia> length(collect(EveryRNAMer{3}(rna"UGDCUGAVC"))) +julia> length(collect(FwRNAMers{3}(rna"UGDCUGAVC"))) 2 ``` """ @@ -35,6 +33,7 @@ struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} end source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S +load_source(x::FwKmers) = x.seq # Constructors FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} @@ -53,116 +52,63 @@ end function Base.length(it::FwKmers) Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) - length(it.seq) - ksize(eltype(it)) + 1 -end - -# Generic fallback -function Base.iterate(it::FwKmers{A, K, S}) where {A <: Alphabet, K, S} - seq = it.seq - length(seq) < K && return nothing - data = zero_tuple(eltype(it)) - bps = BioSequences.bits_per_symbol(A()) - @inbounds for i in 1:K - symbol = seq[i] - encoding = UInt(BioSequences.encode(A(), convert(eltype(A), symbol))) - (_, data) = leftshift_carry(data, bps, encoding) - end - kmer = eltype(it)(unsafe, data) - (kmer, (kmer, K+1)) -end - -function Base.iterate(it::FwKmers, state::Tuple{Kmer, Integer}) - seq = it.seq - (kmer, i) = state - i > length(seq) && return nothing - symbol = @inbounds seq[i] - new_kmer = shift(kmer, convert(eltype(A), symbol)) - (new_kmer, (new_kmer, i+1)) -end - -# These methods can carry the encoding directly over. We call into the internal method -# `iterate_copy`, because specifying the precise type constrains (either the same alphabet -# in the sequence and the iterator, OR both have either TwoBit or FourBit) -# is quite hard. 
-function Base.iterate(it::FwKmers{A, K, <:BioSequence{A}, }) where {A <: Alphabet, K} - iterate_copy(it) -end - -function Base.iterate(it::FwKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}) where K - iterate_copy(it) -end - -function Base.iterate(it::FwKmers{<:FourBit, K, <:BioSequence{<:FourBit}}) where K - iterate_copy(it) + length(usable_source(it)) - ksize(eltype(it)) + 1 end -function Base.iterate(it::FwKmers{A, K, <:BioSequence{A}}, state::Tuple{Kmer, Integer}) where {A <: Alphabet, K} - iterate_copy(it, state) +function Base.iterate(it::FwKmers, state...) + iterate_kmer(RecodingScheme(typeof(it)), it, state...) end -function Base.iterate(it::FwKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Integer}) where K - iterate_copy(it, state) +# For these recoding schemes, no symbols in the source sequence are skipped. +# Hence, we can forward to just `extract` +@inline function iterate_kmer( + R::Union{GenericAlphabet, Copyable, TwoToFour, AsciiEncode, GenericBytes}, + it::FwKmers +) + src = usable_source(it) + length(src) < ksize(eltype(it)) && return nothing + kmer = extract(R, eltype(it), src, 1) + (kmer, (kmer, ksize(eltype(it))+1)) end -function Base.iterate(it::FwKmers{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Integer}) where K - iterate_copy(it, state) -end - -@inline function iterate_copy(it::FwKmers{A, K, S}) where {A, K, S} - seq = it.seq - length(seq) < K && return nothing - data = zero_tuple(eltype(it)) - bps = BioSequences.bits_per_symbol(A()) - for i in 1:K - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - (_, data) = leftshift_carry(data, bps, encoding) - end - kmer = eltype(it)(unsafe, data) - (kmer, (kmer, K+1)) +@inline function iterate_kmer(::GenericAlphabet, it::FwKmers, state::Tuple{Kmer, Integer}) + src = usable_source(it) + (kmer, i) = state + i > length(src) && return nothing + symbol = @inbounds src[i] + new_kmer = shift(kmer, convert(eltype(kmer), symbol)) + (new_kmer, (new_kmer, i+1)) end 
-@inline function iterate_copy(it::FwKmers, state::Tuple{Kmer, Integer}) - seq = it.seq +@inline function iterate_kmer(::Copyable, it::FwKmers, state::Tuple{Kmer, Integer}) + src = usable_source(it) (kmer, i) = state - i > length(seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + i > length(src) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(src, i)) new_kmer = shift_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end -# These methods can use special 2 -> 4 bit recoding -function Base.iterate(it::FwKmers{<:FourBit, K, S}) where {S <: BioSequence{<:TwoBit}, K} - seq = it.seq - length(seq) < K && return nothing - data = zero_tuple(eltype(it)) - for i in 1:K - encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) - (_, data) = leftshift_carry(data, 4, encoding) - end - kmer = eltype(it)(unsafe, data) - (kmer, (kmer, K+1)) -end - -function Base.iterate(it::FwKmers{<:FourBit, K, S}, state::Tuple{Kmer, Integer}) where {K, S <: BioSequence{<:TwoBit}} - seq = it.seq +@inline function iterate_kmer(::TwoToFour, it::FwKmers, state::Tuple{Kmer, Int}) + src = usable_source(it) (kmer, i) = state - i > length(seq) && return nothing - encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) + i > length(src) && return nothing + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(src, i))) new_kmer = shift_encoding(kmer, encoding) (new_kmer, (new_kmer, i+1)) end -# This is special because, by convention, we skip every ambiguous kmer -# instead of erroring. 
-function Base.iterate( - it::FwKmers{A, K, S}, state=(zero_kmer(Kmer{A, K}), K, 1) -) where {A <: TwoBit, K, S <: BioSequence{<:FourBit}} +@inline function iterate_kmer( + ::Skipping, + it::FwKmers{A, K}, + state::Tuple{Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), K, 1) +) where {A, K} (kmer, remaining, i) = state - seq = it.seq + src = usable_source(it) while !iszero(remaining) - i > length(seq) && return nothing + i > length(src) && return nothing # TODO: Also, LUT here? - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + encoding = UInt(BioSequences.extract_encoded_element(src, i)) i += 1 # TODO: Is lookup table faster? remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) @@ -171,15 +117,38 @@ function Base.iterate( return (kmer, (kmer, 1, i)) end -function Base.iterate( - it::FwKmers{A, K}, state=(zero_kmer(Kmer{A, K}), K, 1) -) where {A <: TwoBit, K} +@inline function iterate_kmer(::GenericBytes, it::FwKmers, state::Tuple{Kmer, Int}) + src = usable_source(it) + Base.require_one_based_indexing(src) + (kmer, i) = state + i > length(src) && return nothing + char = reinterpret(Char, (src[i] % UInt32) << 24) + symbol = eltype(eltype(it))(char) + kmer = shift(kmer, symbol) + return (kmer, (kmer, i+1)) +end + +@inline function iterate_kmer(::AsciiEncode, it::FwKmers, state::Tuple{Kmer, Int}) + src = usable_source(it) + Base.require_one_based_indexing(src) + (kmer, i) = state + i > length(src) && return nothing + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), @inbounds(src[i])) + kmer = shift_encoding(kmer, encoding) + return (kmer, (kmer, i+1)) +end + +@inline function iterate_kmer( + ::AsciiSkipping, + it::FwKmers{A, K}, + state=(zero_kmer(Kmer{A, K}), K, 1) +) where {A, K} (kmer, remaining, i) = state - seq = it.seq - Base.require_one_based_indexing(seq) + src = usable_source(it) + Base.require_one_based_indexing(src) while !iszero(remaining) - i > length(seq) && return nothing - byte = @inbounds seq[i] + i > length(src) && return 
nothing + byte = @inbounds src[i] i += 1 encoding = @inbounds BYTE_LUT[byte + 0x01] encoding == 0xff && throw_bad_byte_error(byte) @@ -187,4 +156,4 @@ function Base.iterate( kmer = shift_encoding(kmer, encoding % UInt) end return (kmer, (kmer, 1, i)) -end +end \ No newline at end of file diff --git a/src/iterators/common.jl b/src/iterators/common.jl index 69f0518..95f3baf 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -1,3 +1,14 @@ +""" + AbstractKmerIterator{A <: Alphabet, K} + +Iterates `Kmer{A, K}`. +Functions to implement: +* `Base.iterate` + +Optional functions: +* `source_type` +* `load_source` +""" abstract type AbstractKmerIterator{A <: Alphabet, K} end function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} @@ -15,6 +26,29 @@ Get the type of the data source that kmers are extracted from """ function source_type end +""" + load_source(x::AbstractKmerIterator)::source_type(typeof(x)) + +Get the data source from the kmer iterator. +""" +function load_source end + +""" +usable_source(x::AbstractKmerIterator) + +Convert the source object into whatever is used by the iterator protocol +""" +function usable_source(x::AbstractKmerIterator)::Union{BioSequence, AbstractVector{UInt8}} + loaded = load_source(x) + return if loaded isa Union{BioSequence, AbstractVector{UInt8}} + loaded + elseif loaded isa Union{String, SubString{String}} + codeunits(loaded) + else + error("Does not know how to load data from source") + end +end + # We can't compute the length for # * Unknown alphabets, or unknown source types # * A source type that can contain ambiguous nucleotides, @@ -35,9 +69,70 @@ function Base.IteratorSize(::Type{T}) where {T <: AbstractKmerIterator} end end +"""Trait object which based on static dispatch determines how to recode from +the encoding of the source sequence to the encoding of the kmer""" +abstract type RecodingScheme end + +"We can copy the encoding straight from the source to the kmer" +struct Copyable <: 
RecodingScheme end + +"We can copy the encoding, then bitshift to create 4-bit encoding" +struct TwoToFour <: RecodingScheme end + +"We skip all symbols in source that contain unmappable symbols" +struct Skipping <: RecodingScheme end + +"We can use `BioSequences.ascii_encode`" +struct AsciiEncode <: RecodingScheme end + +"The source is bytes, but we need our own encoding table, +since we must skip ambiguous nucleotides" +struct AsciiSkipping <: RecodingScheme end + +"The source is a bytevector, and we have no static knowledge of efficient +conversion to the right encoding" +struct GenericBytes <: RecodingScheme end + +"Generic fallback when the source is a `BioSequence`" +struct GenericAlphabet <: RecodingScheme end + +function RecodingScheme(::Type{T})::RecodingScheme where {T <: AbstractKmerIterator} + A = Alphabet(eltype(T)) + sT = source_type(T) + return if sT <: BioSequence + As = Alphabet(sT) + if As == A + Copyable() + elseif As isa TwoBit && A isa TwoBit + Copyable() + elseif As isa FourBit && A isa FourBit + Copyable() + elseif As isa FourBit && A isa TwoBit + Skipping() + elseif As isa TwoBit && A isa FourBit + TwoToFour() + else + GenericAlphabet() + end + elseif sT <: ByteSource + codetype = BioSequences.codetype(A) + return if codetype isa BioSequences.AsciiAlphabet + if A isa TwoBit + AsciiSkipping() + else + AsciiEncode() + end + else + return GenericBytes() + end + else + error("Cannot determine recoding scheme of iterator") + end +end + @noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") -const BYTE_LUT = let +const ASCII_SKIPPING_LUT = let v = fill(0xff, 256) for (i, s) in [(0, "Aa"), (1, "cC"), (2, "gG"), (3, "TtUu")], c in s v[UInt8(c) + 1] = i @@ -47,4 +142,64 @@ const BYTE_LUT = let v[UInt8(lowercase(c)) + 1] = 0xf0 end Tuple(v) +end + +"Extract a full kmer at a given index of a sequence. 
+Note: These methods don't do any bounds checking" +function extract end +# TODO: Use extract elsewhere in this code base, e.g. kmer from string instantiation? + +@inline function extract(::GenericAlphabet, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} + length(seq) < ksize(T) && return nothing + data = zero_tuple(T) + A = Alphabet(T) + bps = BioSequences.bits_per_symbol(A) + @inbounds for i in 1:ksize(T) + symbol = seq[i] + encoding = UInt(BioSequences.encode(A, convert(eltype(A), symbol)))::UInt + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +@inline function extract(::TwoToFour, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} + length(seq) < ksize(T) && return nothing + data = zero_tuple(T) + for i in 1:ksize(T) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) + (_, data) = leftshift_carry(data, 4, encoding) + end + T(unsafe, data) +end + +@inline function extract(::Copyable, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(seq)) + for i in from_index:from_index + ksize(T) - 1 + encoding = UInt(BioSequences.extract_encoded_element(seq, i)) + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +@inline function extract(::AsciiEncode, ::Type{T}, seq::AbstractVector{UInt8}, from_index) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(kT)) + @inbounds for i in from_index:from_index + ksize(T) - 1 + encoding = BioSequences.ascii_encode(Alphabet(T), seq[i]) + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +@inline function extract(::GenericBytes, ::Type{T}, seq::AbstractVector{UInt8}, from_index) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(T)) + @inbounds for i in 1:ksize(T) + char = reinterpret(Char, (seq[i] % UInt32) << 24) + symbol = eltype(T)(char) + 
encoding = UInt(BioSequences.encode(Alphabet(T), symbol))::UInt + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) end \ No newline at end of file From 726021b1e7a37716d3a88090daf4575fa450026d Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 2 Oct 2023 19:55:56 +0200 Subject: [PATCH 16/33] Extensive refactoring --- src/Kmers.jl | 9 +- src/indexing.jl | 10 +- src/iterators/CanonicalKmers.jl | 200 ++++++++++++++++++++++++++++ src/iterators/FwCanonicalKmers.jl | 87 ------------ src/iterators/FwKmers.jl | 52 ++++---- src/iterators/SpacedKmers.jl | 13 +- src/iterators/common.jl | 54 ++++++-- src/kmer.jl | 70 ++++++---- src/transformations.jl | 36 ++--- src/tuple_bitflipping.jl | 13 +- test/construction_and_conversion.jl | 5 +- test/runtests.jl | 6 +- 12 files changed, 363 insertions(+), 192 deletions(-) create mode 100644 src/iterators/CanonicalKmers.jl delete mode 100644 src/iterators/FwCanonicalKmers.jl diff --git a/src/Kmers.jl b/src/Kmers.jl index 6011f0a..1689179 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -7,20 +7,15 @@ # License is MIT: https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE module Kmers -export - Kmer, - +export Kmer, DNAKmer, RNAKmer, AAKmer, - DNACodon, RNACodon, - ReverseGeneticCode, reverse_translate, reverse_translate!, - @mer_str, # Immutable operations @@ -140,7 +135,7 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/FwKmers.jl") -include("iterators/FwCanonicalKmers.jl") +include("iterators/CanonicalKmers.jl") include("iterators/SpacedKmers.jl") end # module diff --git a/src/indexing.jl b/src/indexing.jl index 2017985..58028ad 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -10,19 +10,20 @@ end # This is usually type unstable, but in user code, users may use constant-folded ranges, # e.g. f(x) = x[2:4]. In this case, we need it to compile to very efficient code. 
# Hence, it MUST use @inline -@inline function Base.getindex(kmer::Kmer{A}, range::AbstractRange{<:Integer}) where A +@inline function Base.getindex(kmer::Kmer{A}, range::AbstractRange{<:Integer}) where {A} @boundscheck checkbounds(kmer, range) T = derive_type(Kmer{A, length(range)}) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) for i in range - (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) end T(unsafe, data) end # Same as above: This needs to be able to inline if the indices are known statically -@inline function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{Bool}) where A +@inline function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{Bool}) where {A} @boundscheck checkbounds(eachindex(kmer), indices) K = sum(indices) N = n_coding_elements(Kmer{A, K}) @@ -31,7 +32,8 @@ end nbits = BioSequences.bits_per_symbol(A()) for (i, bool) in enumerate(indices) bool || continue - (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) end T(unsafe, data) end diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl new file mode 100644 index 0000000..960cf7d --- /dev/null +++ b/src/iterators/CanonicalKmers.jl @@ -0,0 +1,200 @@ +struct CanonicalKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} + it::FwKmers{A, K, S} +end + +source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S +load_source(x::CanonicalKmers) = x.it.seq +Base.length(it::CanonicalKmers) = length(it.it) + +# Constructors +function CanonicalKmers{A, K}(s::S) where {S, A <: Alphabet, K} + CanonicalKmers{S, A, K}(FwKmers{A, K}(s)) +end + +# Iteration +function Base.iterate(it::CanonicalKmers, state...) + iterate_kmer(RecodingScheme(typeof(it)), it, state...) 
+end + +# For these recoding schemes, no symbols in the source sequence are skipped. +# Hence, we can forward to just `extract`. +# Here, instead of reverse complementing each symbol, it's more efficient +# to do it in bulk by RC'ing the entire kmer +@inline function iterate_kmer( + R::Union{GenericAlphabet, Copyable, TwoToFour, AsciiEncode, GenericBytes}, + it::CanonicalKmers, +) + src = usable_source(it) + length(src) < ksize(eltype(it)) && return nothing + fw = extract(R, eltype(it), src, 1) + rv = reverse_complement(fw) + (min(fw, rv), (fw, rv, ksize(eltype(it)) + 1)) +end + +# Fallback: Just because it's Copyable doesn't mean we have neat bit-tricks +# to RC the encoding +@inline function iterate_kmer( + ::Union{GenericAlphabet, Copyable}, + it::CanonicalKmers, + state::Tuple{Kmer, Kmer, Int}, +) + src = usable_source(it) + (fw, rv, i) = state + i > length(src) && return nothing + symbol = @inbounds src[i] + encoding = UInt(BioSequences.encode(Alphabet(typeof(fw)), symbol))::UInt + rc_encoding = UInt(BioSequences.encode(Alphabet(typeof(fw)), complement(symbol)))::UInt + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::Copyable, + it::CanonicalKmers{<:TwoBit, K, <:TwoBit}, + state::Tuple{Kmer, Kmer, Int}, +) where {K} + src = usable_source(it) + (fw, rv, i) = state + i > length(src) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt + rc_encoding = encoding ⊻ UInt(3) + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::Copyable, + it::CanonicalKmers{<:FourBit, K, <:FourBit}, + state::Tuple{Kmer, Kmer, Int}, +) where {K} + src = usable_source(it) + (fw, rv, i) = state + i > length(src) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt + rc_encoding = 
(@inbounds(FOURBIT_COMPLEMENT_LUT[encoding + UInt(1)])) % UInt + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::TwoToFour, + it::CanonicalKmers, + state::Tuple{Kmer, Kmer, Int}, +) + src = usable_source(it) + (fw, rv, i) = state + i > length(src) && return nothing + twobit_encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt + fw_encoding = reinterpret(UInt8, decode(Alphabet(fw), twobit_encoding)) % UInt + rc_encoding = reinterpret(UInt8, decode(Alphabet(fw), twobit_encoding ⊻ UInt(3))) % UInt + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +# 4 -> 2 (skipping): Ascii skipping LUT, as with FwKmers - DEFAULT STATE +@inline function iterate_kmer( + ::Skipping, + it::CanonicalKmers{A, K}, + state::Tuple{Kmer, Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), zero_kmer(Kmer{A, K}), K, 1), +) where {A, K} + src = usable_source(it) + (fw, rv, remaining, i) = state + while !iszero(remaining) + i > length(src) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt + i += 1 + if isone(count_ones(encoding)) + fw_encoding = trailing_zeros(encoding) % UInt + fw = shift_encoding(fw, fw_encoding) + rv = shift_first_encoding(rv, fw_encoding ⊻ UInt(3)) + remaining -= 1 + else + remaining = K + # No need to RC anything + continue + end + end + return (min(fw, rv), (fw, rv, 1, i)) +end + +@inline function iterate_kmer( + ::AsciiSkipping, + it::CanonicalKmers{A, K}, + state::Tuple{Kmer, Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), zero_kmer(Kmer{A, K}), K, 1), +) where {A, K} + src = usable_source(it) + Base.require_one_based_indexing(src) + (fw, rv, remaining, i) = state + while !iszero(remaining) + i > length(src) && return nothing + byte = @inbounds src[i] + i += 1 + encoding = @inbounds BYTE_LUT[byte + 0x01] + encoding == 0xff && throw_bad_byte_error(byte) + if 
encoding == 0xf0 + remaining = K + continue + else + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, encoding ⊻ UInt(3)) + remaining -= 1 + end + end + return (min(fw, rv), (fw, rv, 1, i)) +end + +@inline function iterate_kmer( + ::AsciiEncode, + it::CanonicalKmers, + state::Tuple{Kmer, Kmer, Int}, +) + src = usable_source(it) + Base.require_one_based_indexing(src) + (fw, rv, i) = state + A = Alphabet(typeof(fw)) + i > length(src) && return nothing + encoding = UInt(BioSequences.ascii_encode(A, @inbounds(src[i])))::UInt + rc_encoding = + UInt(BioSequences.encode(A, complement(BioSequences.decode(A, encoding))))::UInt + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::AsciiEncode, + it::CanonicalKmers{<:FourBit}, + state::Tuple{Kmer, Kmer, Int}, +) + src = usable_source(it) + Base.require_one_based_indexing(src) + (fw, rv, i) = state + A = Alphabet(typeof(fw)) + i > length(src) && return nothing + encoding = UInt(BioSequences.ascii_encode(A, @inbounds(src[i])))::UInt + rc_encoding = @inbounds(FOURBIT_COMPLEMENT_LUT[encoding + 0x01]) % UInt + fw = shift_encoding(kmer, encoding) + rv = shift_first_encoding(rv, rc_encoding) + (min(fw, rc), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::GenericBytes, + it::CanonicalKmers, + state::Tuple{Kmer, Kmer, Int}, +) + src = usable_source(it) + Base.require_one_based_indexing(src) + (fw, rv, i) = state + i > length(src) && return nothing + char = reinterpret(Char, (src[i] % UInt32) << 24) + fw_symbol = eltype(fw)(char) + rc_symbol = complement(fw_symbol) + fw = shift(fw, fw_symbol) + rv = shift(rv, rc_symbol) + (min(fw, rc), (fw, rv, i + 1)) +end diff --git a/src/iterators/FwCanonicalKmers.jl b/src/iterators/FwCanonicalKmers.jl deleted file mode 100644 index 80c60ad..0000000 --- a/src/iterators/FwCanonicalKmers.jl +++ /dev/null @@ -1,87 +0,0 @@ -struct FwCanonicalKmers{A <: Alphabet, K, S} <: 
AbstractKmerIterator{A, K} - it::FwKmers{A, K, S} -end - -const SameFwCanonicalKmers{A, K, S} = FwCanonicalKmers{S, A, K} where {A, S <: BioSequence{A}} - -source_type(::Type{FwCanonicalKmers{A, K, S}}) where {A, K, S} = S - -function FwCanonicalKmers{K}(s::BioSequence) where K - S = typeof(s) - A = typeof(Alphabet(S)) - it = FwKmers{S, A, K}(s) - FwCanonicalKmers{S, A, K}(it) -end - -function FwCanonicalKmers{A, K}(s::S) where {S <: BioSequence, A <: Alphabet, K} - FwCanonicalKmers{S, A, K}(FwKmers{A, K}(s)) -end - -function FwCanonicalKmers{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} - s2 = codeunits(s) - FwCanonicalKmers{typeof(s2), A, K}(s2) -end - -Base.length(it::SameFwCanonicalKmers) = length(it.it) - -# Generic iterator for the first element: I think we can do no better than to reverse-complement -# the entire kmer. However, for the following iterations, it's faster to add a single basepair to -# the RC'd kmer than to RC it from scratch, hence we need specialized methods for efficient RC'ing -# of individual bases. 
-function Base.iterate(it::FwCanonicalKmers{S, A, K}) where {S, A, K} - itval = iterate(it.it) - itval === nothing && return nothing - fw = first(itval) - rv = reverse_complement(fw) - (min(fw, rv), (fw, rv, K+1)) -end - -# Generic fallback -function Base.iterate( - it::FwCanonicalKmers{S, A, K}, - state::Tuple{Kmer, Kmer, Integer} -) where {S, A, K} - seq = it.it.seq - (fw, rv, i) = state - i > length(seq) && return nothing - symbol = @inbounds seq[i] - encoding = UInt(BioSequences.encode(A, symbol)) - rc_encoding = UInt(BioSequences.encode(A, complement(symbol))) - fw = shift_encoding(fw, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rv), (fw, rv, i+1)) -end - -# Special method for 2bit -> 2bit -function Base.iterate( - it::FwCanonicalKmers{S, A, K}, - state::Tuple{Kmer, Kmer, Integer} -) where {K, A <: TwoBit, S <: BioSequence{A}} - seq = it.it.seq - (fw, rv, i) = state - i > length(seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - rc_encoding = encoding ⊻ 0x3 - fw = shift_encoding(fw, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rv), (fw, rv, i+1)) -end - -# Special method for 2bit -> 4bit -function Base.iterate( - it::FwCanonicalKmers{S, A, K}, - state::Tuple{Kmer, Kmer, Integer} -) where {K, A <: FourBit, S <: BioSequence{A}} - seq = it.it.seq - (fw, rv, i) = state - i > length(seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - # Reverse-complementing like this is surprisingly inefficient. 
- # We may want to consider either using a 16-element LUT, or - # else simply changing the algorithm such that the whole kmer - # is reverse-complemented at every iteration - rc_encoding = reinterpret(UInt8, complement(reinterpret(DNA, encoding % UInt8))) % UInt - fw = shift_encoding(fw, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rv), (fw, rv, i+1)) -end diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index ae512ca..b9dff05 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -35,25 +35,21 @@ end source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S load_source(x::FwKmers) = x.seq +function Base.length(it::FwKmers) + Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) + length(usable_source(it)) - ksize(eltype(it)) + 1 +end + # Constructors -FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} -FwDNAMers{K}(s) where K = FwDNAMers{K, typeof(s), }(s) -FwRNAMers{K}(s) where K = FwRNAMers{K, typeof(s)}(s) -FwAAMers{K}(s) where K = FwAAMers{K, typeof(s)}(s) - -function FwKmers{A, K}(s::S) where {S <: Union{String, SubString{String}}, A <: Alphabet, K} - s2 = codeunits(s) - FwKmers{A, K, typeof(s2)}(s2) -end +FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} -function Base.length(it::FwKmers) - Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) - length(usable_source(it)) - ksize(eltype(it)) + 1 -end +FwDNAMers{K}(s) where {K} = FwDNAMers{K, typeof(s)}(s) +FwRNAMers{K}(s) where {K} = FwRNAMers{K, typeof(s)}(s) +FwAAMers{K}(s) where {K} = FwAAMers{K, typeof(s)}(s) function Base.iterate(it::FwKmers, state...) iterate_kmer(RecodingScheme(typeof(it)), it, state...) 
@@ -63,30 +59,30 @@ end # Hence, we can forward to just `extract` @inline function iterate_kmer( R::Union{GenericAlphabet, Copyable, TwoToFour, AsciiEncode, GenericBytes}, - it::FwKmers + it::FwKmers, ) src = usable_source(it) length(src) < ksize(eltype(it)) && return nothing kmer = extract(R, eltype(it), src, 1) - (kmer, (kmer, ksize(eltype(it))+1)) + (kmer, (kmer, ksize(eltype(it)) + 1)) end -@inline function iterate_kmer(::GenericAlphabet, it::FwKmers, state::Tuple{Kmer, Integer}) +@inline function iterate_kmer(::GenericAlphabet, it::FwKmers, state::Tuple{Kmer, Int}) src = usable_source(it) (kmer, i) = state i > length(src) && return nothing symbol = @inbounds src[i] new_kmer = shift(kmer, convert(eltype(kmer), symbol)) - (new_kmer, (new_kmer, i+1)) + (new_kmer, (new_kmer, i + 1)) end -@inline function iterate_kmer(::Copyable, it::FwKmers, state::Tuple{Kmer, Integer}) +@inline function iterate_kmer(::Copyable, it::FwKmers, state::Tuple{Kmer, Int}) src = usable_source(it) (kmer, i) = state i > length(src) && return nothing encoding = UInt(BioSequences.extract_encoded_element(src, i)) new_kmer = shift_encoding(kmer, encoding) - (new_kmer, (new_kmer, i+1)) + (new_kmer, (new_kmer, i + 1)) end @inline function iterate_kmer(::TwoToFour, it::FwKmers, state::Tuple{Kmer, Int}) @@ -95,22 +91,20 @@ end i > length(src) && return nothing encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(src, i))) new_kmer = shift_encoding(kmer, encoding) - (new_kmer, (new_kmer, i+1)) + (new_kmer, (new_kmer, i + 1)) end @inline function iterate_kmer( ::Skipping, it::FwKmers{A, K}, - state::Tuple{Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), K, 1) + state::Tuple{Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), K, 1), ) where {A, K} (kmer, remaining, i) = state src = usable_source(it) while !iszero(remaining) i > length(src) && return nothing - # TODO: Also, LUT here? encoding = UInt(BioSequences.extract_encoded_element(src, i)) i += 1 - # TODO: Is lookup table faster? 
remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) kmer = shift_encoding(kmer, trailing_zeros(encoding) % UInt) end @@ -125,8 +119,8 @@ end char = reinterpret(Char, (src[i] % UInt32) << 24) symbol = eltype(eltype(it))(char) kmer = shift(kmer, symbol) - return (kmer, (kmer, i+1)) -end + return (kmer, (kmer, i + 1)) +end @inline function iterate_kmer(::AsciiEncode, it::FwKmers, state::Tuple{Kmer, Int}) src = usable_source(it) @@ -135,13 +129,13 @@ end i > length(src) && return nothing encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), @inbounds(src[i])) kmer = shift_encoding(kmer, encoding) - return (kmer, (kmer, i+1)) + return (kmer, (kmer, i + 1)) end @inline function iterate_kmer( ::AsciiSkipping, it::FwKmers{A, K}, - state=(zero_kmer(Kmer{A, K}), K, 1) + state=(zero_kmer(Kmer{A, K}), K, 1), ) where {A, K} (kmer, remaining, i) = state src = usable_source(it) @@ -150,10 +144,10 @@ end i > length(src) && return nothing byte = @inbounds src[i] i += 1 - encoding = @inbounds BYTE_LUT[byte + 0x01] + encoding = @inbounds ASCII_SKIPPING_LUT[byte + 0x01] encoding == 0xff && throw_bad_byte_error(byte) remaining = ifelse(encoding == 0xf0, K, remaining - 1) kmer = shift_encoding(kmer, encoding % UInt) end return (kmer, (kmer, 1, i)) -end \ No newline at end of file +end diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl index ffa96ef..48093d7 100644 --- a/src/iterators/SpacedKmers.jl +++ b/src/iterators/SpacedKmers.jl @@ -1,5 +1,3 @@ -# TODO: Can the FW kmers simply be an alias for SpacedKmers with step = 1 - """ SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} @@ -28,7 +26,7 @@ struct SpacedKmers{A <: Alphabet, K, St, S} <: AbstractKmerIterator{A, K} function SpacedKmers{A, K, St, S}(seq) where {A, K, St, S} (K isa Int && K > 0) || error("K must be an Int > 0") - (St isa Int && St > 0) || error("St must be an Int > 0") + (St isa Int && St > 0) || error("St must be an Int > 0") new{A, K, St, 
S}(seq) end end @@ -78,14 +76,17 @@ end (_, data) = leftshift_carry(data, bps, encoding) state += 1 remaining -= 1 - iszero(remaining) && return (eltype(it)(unsafe, data), state + max(0, St-K)) + iszero(remaining) && return (eltype(it)(unsafe, data), state + max(0, St - K)) end end # Called when St < K, and the encoding in seq matches that of the kmer. # We can copy the encoding right over, and we need to preserve some data in the kmer # between iterations -@inline function iterate_copy_mask(it::SpacedKmers{A, K, St}, state::Tuple{Int, Tuple{Vararg{UInt}}}) where {A, K, St} +@inline function iterate_copy_mask( + it::SpacedKmers{A, K, St}, + state::Tuple{Int, Tuple{Vararg{UInt}}}, +) where {A, K, St} seq = it.seq len = length(seq) bps = BioSequences.bits_per_symbol(A()) @@ -110,4 +111,4 @@ end # 4 -> 2 bit # 2 -> 4 bit? # Byte sequence: 2 bit -# Byte sequence: other alphabets \ No newline at end of file +# Byte sequence: other alphabets diff --git a/src/iterators/common.jl b/src/iterators/common.jl index 95f3baf..dcba46d 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -62,7 +62,7 @@ function Base.IteratorSize(::Type{T}) where {T <: AbstractKmerIterator} Base.HasLength() elseif sT <: BioSequence && Alphabet(sT) isa TwoBit && A isa FourBit Base.HasLength() - elseif sT <: ByteSource && isa Union{FourBit, AminoAcidAlphabet} + elseif sT <: ByteSource && A isa Union{FourBit, AminoAcidAlphabet} Base.HasLength() else Base.SizeUnknown() @@ -130,7 +130,8 @@ function RecodingScheme(::Type{T})::RecodingScheme where {T <: AbstractKmerItera end end -@noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") +@noinline throw_bad_byte_error(b::UInt8) = + error("Cannot interpret byte $(repr(b)) as nucleotide") const ASCII_SKIPPING_LUT = let v = fill(0xff, 256) @@ -144,12 +145,25 @@ const ASCII_SKIPPING_LUT = let Tuple(v) end +const FOURBIT_COMPLEMENT_LUT = let + v = fill(0x00, 16) + for i in alphabet(DNA) + 
v[reinterpret(UInt8, i) + 0x01] = reinterpret(UInt8, complement(i)) + end + Tuple(v) +end + "Extract a full kmer at a given index of a sequence. Note: These methods don't do any bounds checking" function extract end # TODO: Use extract elsewhere in this code base, e.g. kmer from string instantiation? -@inline function extract(::GenericAlphabet, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} +@inline function extract( + ::GenericAlphabet, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} length(seq) < ksize(T) && return nothing data = zero_tuple(T) A = Alphabet(T) @@ -162,7 +176,12 @@ function extract end T(unsafe, data) end -@inline function extract(::TwoToFour, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} +@inline function extract( + ::TwoToFour, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} length(seq) < ksize(T) && return nothing data = zero_tuple(T) for i in 1:ksize(T) @@ -172,27 +191,42 @@ end T(unsafe, data) end -@inline function extract(::Copyable, ::Type{T}, seq::BioSequence, from_index) where {T <: Kmer} +@inline function extract( + ::Copyable, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} data = zero_tuple(T) bps = BioSequences.bits_per_symbol(Alphabet(seq)) - for i in from_index:from_index + ksize(T) - 1 + for i in from_index:(from_index + ksize(T) - 1) encoding = UInt(BioSequences.extract_encoded_element(seq, i)) (_, data) = leftshift_carry(data, bps, encoding) end T(unsafe, data) end -@inline function extract(::AsciiEncode, ::Type{T}, seq::AbstractVector{UInt8}, from_index) where {T <: Kmer} +@inline function extract( + ::AsciiEncode, + ::Type{T}, + seq::AbstractVector{UInt8}, + from_index, +) where {T <: Kmer} data = zero_tuple(T) bps = BioSequences.bits_per_symbol(Alphabet(kT)) - @inbounds for i in from_index:from_index + ksize(T) - 1 + @inbounds for i in from_index:(from_index + ksize(T) - 1) encoding = BioSequences.ascii_encode(Alphabet(T), seq[i]) (_, data) = 
leftshift_carry(data, bps, encoding) end T(unsafe, data) end -@inline function extract(::GenericBytes, ::Type{T}, seq::AbstractVector{UInt8}, from_index) where {T <: Kmer} +@inline function extract( + ::GenericBytes, + ::Type{T}, + seq::AbstractVector{UInt8}, + from_index, +) where {T <: Kmer} data = zero_tuple(T) bps = BioSequences.bits_per_symbol(Alphabet(T)) @inbounds for i in 1:ksize(T) @@ -202,4 +236,4 @@ end (_, data) = leftshift_carry(data, bps, encoding) end T(unsafe, data) -end \ No newline at end of file +end diff --git a/src/kmer.jl b/src/kmer.jl index 60a87ec..5eb940d 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -96,13 +96,13 @@ end @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K @inline bits_unused(T::Type{<:Kmer}) = n_unused(T) * BioSequences.bits_per_symbol(T) -@inline BioSequences.Alphabet(::Kmer{A}) where A = A() +@inline BioSequences.Alphabet(::Kmer{A}) where {A} = A() @inline function n_coding_elements(::Type{<:Kmer{A, K}}) where {A, K} cld(BioSequences.bits_per_symbol(A()) * K, 8 * sizeof(UInt)) end -@inline function per_word_capacity(::Type{<:Kmer{A}}) where A +@inline function per_word_capacity(::Type{<:Kmer{A}}) where {A} div(8 * sizeof(UInt), BioSequences.bits_per_symbol(A())) end @@ -114,7 +114,8 @@ end per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) end -@inline derive_type(::Type{Kmer{A, K}}) where {A, K} = Kmer{A, K, n_coding_elements(Kmer{A, K})} +@inline derive_type(::Type{Kmer{A, K}}) where {A, K} = + Kmer{A, K, n_coding_elements(Kmer{A, K})} ################################################ # Constructors @@ -129,7 +130,11 @@ function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} end # Generic, unknown size -@inline function construct_generic(::Base.SizeUnknown, T::Type{<:Kmer{A, K}}, itr) where {A, K} +@inline function construct_generic( + ::Base.SizeUnknown, + T::Type{<:Kmer{A, K}}, + itr, +) where {A, K} check_kmer(T) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) @@ 
-143,7 +148,11 @@ end end # Generic, size known -@inline function construct_generic_unchecked(::Union{Base.HasLength, Base.HasShape}, T::Type{<:Kmer{A}}, itr) where A +@inline function construct_generic_unchecked( + ::Union{Base.HasLength, Base.HasShape}, + T::Type{<:Kmer{A}}, + itr, +) where {A} check_kmer(T) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) @@ -156,7 +165,11 @@ end end # Generic, size known but length not checked. -@inline function construct_generic(iT::Union{Base.HasLength, Base.HasShape}, T::Type{<:Kmer{A, K}}, itr) where {A, K} +@inline function construct_generic( + iT::Union{Base.HasLength, Base.HasShape}, + T::Type{<:Kmer{A, K}}, + itr, +) where {A, K} length(itr) == K || error("Length of sequence must be K elements to build Kmer") construct_generic_unchecked(iT, T, itr) end @@ -164,13 +177,16 @@ end # BioSequences with the same Alphabet and these element types do not need to decode # and encode, but can copy the raw bits directly into the kmer @inline function construct_unchecked( - T::Type{<:Kmer{A}}, s::BioSequence{A}, data_eltype::Type{E} + T::Type{<:Kmer{A}}, + s::BioSequence{A}, + data_eltype::Type{E}, ) where {A <: Alphabet, E <: Union{UInt8, UInt16, UInt32, UInt}} check_kmer(T) data = zero_tuple(T) nbits = BioSequences.bits_per_symbol(A()) for i in 1:ksize(T) - (_, data) = leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) end T(unsafe, data) end @@ -178,7 +194,11 @@ end # With LongSequence of the same alphabet, entire coding elements can be copied # directly. 
# TODO: Test that LongSequence and LongSubSeq encoded_data_eltype is UInt -@inline function construct_unchecked(T::Type{<:Kmer{A}}, s::LongSequence{A}, data_eltype::Type{UInt}) where {A <: Alphabet} +@inline function construct_unchecked( + T::Type{<:Kmer{A}}, + s::LongSequence{A}, + data_eltype::Type{UInt}, +) where {A <: Alphabet} check_kmer(T) Bps = BioSequences.BitsPerSymbol(A()) data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), Bps), Val{nsize(T)}()) @@ -215,8 +235,8 @@ end # BioSequence: Various missing type parameters Kmer{A, K}(s::BioSequence) where {A, K} = derive_type(Kmer{A, K})(s) -Kmer{A}(s::BioSequence) where A = derive_type(Kmer{A, length(s)})(s) -Kmer(s::BioSequence{A}) where A = derive_type(Kmer{A, length(s)})(s) +Kmer{A}(s::BioSequence) where {A} = derive_type(Kmer{A, length(s)})(s) +Kmer(s::BioSequence{A}) where {A} = derive_type(Kmer{A, length(s)})(s) # Iterators: Various missing type parameters. # It's too impractical to construct a kmer before we know the value of K, @@ -230,15 +250,15 @@ function Kmer{A, K}(iT::Union{Base.HasLength, Base.HasShape}, itr) where {A, K} construct_generic_unchecked(iT, derive_type(Kmer{A, K}), itr) end -Kmer{A}(itr) where A = Kmer{A}(Base.IteratorSize(itr), itr) -Kmer{A}(::Base.SizeUnknown, itr) where A = Kmer{A}(vec(collect(itr))) +Kmer{A}(itr) where {A} = Kmer{A}(Base.IteratorSize(itr), itr) +Kmer{A}(::Base.SizeUnknown, itr) where {A} = Kmer{A}(vec(collect(itr))) -function Kmer{A}(iT::Union{Base.HasLength, Base.HasShape}, itr) where A +function Kmer{A}(iT::Union{Base.HasLength, Base.HasShape}, itr) where {A} construct_generic_unchecked(iT, derive_type(Kmer{A, length(itr)}), itr) end # Strings: Various missing type parameters -function Kmer{A}(s::Union{String, SubString{String}}) where A +function Kmer{A}(s::Union{String, SubString{String}}) where {A} construct_generic_unchecked(Base.HasLength(), derive_type(Kmer{A, length(s)}), s) end @@ -302,7 +322,7 @@ function Base.print(io::IO, s::Kmer) end 
Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) -Base.:(==)(x::Kmer{A}, y::Kmer{A}) where A = x.data == y.data +Base.:(==)(x::Kmer{A}, y::Kmer{A}) where {A} = x.data == y.data Base.isless(x::T, y::T) where {T <: Kmer} = isless(x.data, y.data) Base.isequal(x::Kmer, y::BioSequence) = false @@ -311,7 +331,7 @@ Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) function push(kmer::Kmer, s) bps = BioSequences.bits_per_symbol(kmer) - newT = derive_type(Kmer{A, length(kmer)+1}) + newT = derive_type(Kmer{A, length(kmer) + 1}) # If no free space in data, add new tuple new_data = if n_unused(typeof(kmer)) < bps (zero(UInt), kmer.data...) @@ -340,7 +360,7 @@ AminoAcid 10-mer KYMLPIIRSF ``` """ -function shift(kmer::Kmer{A}, s) where A +function shift(kmer::Kmer{A}, s) where {A} encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) shift_encoding(kmer, encoding) end @@ -352,9 +372,9 @@ end typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) end -function pushfirst(kmer::Kmer{A}, s) where A +function pushfirst(kmer::Kmer{A}, s) where {A} bps = BioSequences.bits_per_symbol(A()) - newT = derive_type(Kmer{A, length(kmer)+1}) + newT = derive_type(Kmer{A, length(kmer) + 1}) # If no free space in data, add new tuple new_data = if n_unused(typeof(kmer)) < bps (zero(UInt), kmer.data...) @@ -383,12 +403,12 @@ AminoAcid 10-mer FWKYMLPIIR ``` """ -function shift_first(kmer::Kmer{A}, s) where A +function shift_first(kmer::Kmer{A}, s) where {A} encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) shift_first_encoding(kmer, encoding) end -function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where A +function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where {A} bps = BioSequences.bits_per_symbol(A()) (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) (head, tail...) 
= new_data @@ -396,10 +416,10 @@ function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where A typeof(kmer)(unsafe, (head, tail...)) end -function pop(kmer::Kmer{A}) where A +function pop(kmer::Kmer{A}) where {A} isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) bps = BioSequences.bits_per_symbol(A()) - newT = derive_type(Kmer{A, length(kmer)-1}) + newT = derive_type(Kmer{A, length(kmer) - 1}) (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) new_data = if elements_in_head(typeof(kmer)) == 1 (head, tail...) = new_data @@ -413,5 +433,5 @@ end # Get a mask 0x0001111 ... masking away the unused bits of the head element # in the UInt tuple @inline function get_mask(T::Type{<:Kmer}) - UInt(1) << (8*sizeof(UInt) - bits_unused(T)) - 1 + UInt(1) << (8 * sizeof(UInt) - bits_unused(T)) - 1 end diff --git a/src/transformations.jl b/src/transformations.jl index 59d2188..eda9ec7 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -44,23 +44,25 @@ BioSequences.iscanonical(x::Kmer) = x <= reverse_complement(x) function BioSequences.translate( seq::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}; - code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, # noop in this method - alternative_start::Bool = false + code::BioSequences.GeneticCode=BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool=true, # noop in this method + alternative_start::Bool=false, ) n_aa, remainder = divrem(length(seq), 3) - iszero(remainder) || error("LongRNA length is not divisible by three. Cannot translate.") + iszero(remainder) || + error("LongRNA length is not divisible by three. 
Cannot translate.") N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) T = Kmer{AminoAcidAlphabet, n_aa, N} data = zero_tuple(T) @inbounds for i in 1:n_aa - a = seq[3i-2] - b = seq[3i-1] - c = seq[3i-0] + a = seq[3i - 2] + b = seq[3i - 1] + c = seq[3i - 0] codon = BioSequences.unambiguous_codon(a, b, c) aa = code[codon] carry = UInt(reinterpret(UInt8, aa)) - (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) + (_, data) = + leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end result = T(unsafe, data) if alternative_start && !iszero(ksize(typeof(seq))) @@ -72,19 +74,20 @@ end function BioSequences.translate( seq::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; - code::BioSequences.GeneticCode = BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, # noop in this method - alternative_start::Bool = false + code::BioSequences.GeneticCode=BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool=true, # noop in this method + alternative_start::Bool=false, ) n_aa, remainder = divrem(length(seq), 3) - iszero(remainder) || error("LongRNA length is not divisible by three. Cannot translate.") + iszero(remainder) || + error("LongRNA length is not divisible by three. 
Cannot translate.") N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) T = Kmer{AminoAcidAlphabet, n_aa, N} data = zero_tuple(T) @inbounds for i in 1:n_aa - a = reinterpret(RNA, seq[3i-2]) - b = reinterpret(RNA, seq[3i-1]) - c = reinterpret(RNA, seq[3i-0]) + a = reinterpret(RNA, seq[3i - 2]) + b = reinterpret(RNA, seq[3i - 1]) + c = reinterpret(RNA, seq[3i - 0]) aa = if isgap(a) | isgap(b) | isgap(c) error("Cannot translate nucleotide sequences with gaps.") elseif iscertain(a) & iscertain(b) & iscertain(c) @@ -93,7 +96,8 @@ function BioSequences.translate( BioSequences.try_translate_ambiguous_codon(code, a, b, c, allow_ambiguous_codons) end carry = UInt(reinterpret(UInt8, aa)) - (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) + (_, data) = + leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end result = T(unsafe, data) if alternative_start && !iszero(ksize(typeof(seq))) diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 0858748..659e147 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -106,14 +106,22 @@ end # Shift a tuple left nbits, carry over bits between tuple elements, and OR # the `carry` argument to the right side of the resulting tuple. # Returns (new_carry, new_tuple) -@inline function leftshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} +@inline function leftshift_carry( + x::Tuple{Vararg{T}}, + nbits::Integer, + carry::T, +) where {T <: Unsigned} head, tail... = x (new_carry, new_tail) = leftshift_carry(tail, nbits, carry) new_head = left_shift(head, nbits) | new_carry (left_carry(head, nbits), (new_head, new_tail...)) end -@inline function rightshift_carry(x::Tuple{Vararg{T}}, nbits::Integer, carry::T) where {T <: Unsigned} +@inline function rightshift_carry( + x::Tuple{Vararg{T}}, + nbits::Integer, + carry::T, +) where {T <: Unsigned} head, tail... 
= x new_head = right_shift(head, nbits) | right_carry(carry, nbits) mask = left_shift(UInt(1), nbits) - 1 @@ -125,4 +133,3 @@ end # Recusion terminator for above @inline leftshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) @inline rightshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) - diff --git a/test/construction_and_conversion.jl b/test/construction_and_conversion.jl index 68c1465..fb7f47d 100644 --- a/test/construction_and_conversion.jl +++ b/test/construction_and_conversion.jl @@ -91,8 +91,9 @@ global reps = 10 ], ) @test all( - Bool[check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps - ], + Bool[ + check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps + ], ) # Long(DNA|RNA)Seq Constructions diff --git a/test/runtests.jl b/test/runtests.jl index 60ef01d..41394eb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,7 +7,8 @@ using Test include("utils.jl") @testset "BioSequences Interface" begin - for A in [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}, AminoAcidAlphabet] + for A in + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}, AminoAcidAlphabet] for K in (1, 9, 116) @test BioSequences.has_interface( BioSequence, @@ -19,8 +20,7 @@ include("utils.jl") end end -@testset "Construction" begin -end +@testset "Construction" begin end # include("construction_and_conversion.jl") # include("comparisons.jl") From db1f16c4cd3958942e441b4b0fa1605a1e27e451 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 3 Oct 2023 18:33:37 +0200 Subject: [PATCH 17/33] More refactoring --- Project.toml | 2 +- src/Kmers.jl | 10 +- src/construction.jl | 253 +++++++++++++++++++++++++++++++++++++++ src/iterators/FwKmers.jl | 130 ++++++++------------ src/iterators/common.jl | 186 +--------------------------- src/kmer.jl | 191 +++-------------------------- src/transformations.jl | 2 +- 7 files changed, 335 insertions(+), 439 deletions(-) create mode 100644 
src/construction.jl diff --git a/Project.toml b/Project.toml index 28dacf5..d7e4bed 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,7 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" [compat] BioSequences = "3.1.3" -julia = "1.5" +julia = "1.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/Kmers.jl b/src/Kmers.jl index 1689179..5526384 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -127,15 +127,21 @@ Internal trait object used to access unsafe methods of functions. struct Unsafe end const unsafe = Unsafe() +const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} +const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} +const Bytes = Union{String, SubString{String}, AbstractVector{UInt8}} +const BitInteger = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128} + include("tuple_bitflipping.jl") include("kmer.jl") +include("construction.jl") include("indexing.jl") include("transformations.jl") include("revtrans.jl") include("iterators/common.jl") include("iterators/FwKmers.jl") -include("iterators/CanonicalKmers.jl") -include("iterators/SpacedKmers.jl") +#include("iterators/CanonicalKmers.jl") +#include("iterators/SpacedKmers.jl") end # module diff --git a/src/construction.jl b/src/construction.jl new file mode 100644 index 0000000..732a973 --- /dev/null +++ b/src/construction.jl @@ -0,0 +1,253 @@ +################################################ +# Trait dispatch +################################################ + +"""Trait object which based on static dispatch determines how to recode from +the encoding of the source sequence to the encoding of the kmer""" +abstract type RecodingScheme end + +"We can copy the encoding straight from the source to the kmer" +struct Copyable <: RecodingScheme end + +"We can copy the encoding, then bitshift to create 4-bit encoding" +struct TwoToFour <: RecodingScheme end + +"We skip all symbols in source that contain unmappable symbols" +struct FourToTwo <: 
RecodingScheme end + +"We can use `BioSequences.ascii_encode`" +struct AsciiEncode <: RecodingScheme end + +"The source is a bytevector, and we have no static knowledge of efficient +conversion to the right encoding" +struct GenericRecoding <: RecodingScheme end + +function RecodingScheme(A::Alphabet, source_type::Type)::RecodingScheme + return if source_type <: BioSequence + if BioSequences.encoded_data_eltype(source_type) <: BitInteger + As = Alphabet(source_type) + if As == A + Copyable() + elseif As isa TwoBit && A isa TwoBit + Copyable() + elseif As isa FourBit && A isa FourBit + Copyable() + elseif As isa FourBit && A isa TwoBit + FourToTwo() + elseif As isa TwoBit && A isa FourBit + TwoToFour() + else + GenericRecoding() + end + else + GenericRecoding() + end + elseif source_type <: Bytes && BioSequences.codetype(A) isa BioSequences.AsciiAlphabet + AsciiEncode() + else + GenericRecoding() + end +end + +################################################ +# Unsafe extract +################################################ + +"Extract a full kmer at a given index of a sequence. 
+Note: These methods don't do any bounds checking" +function unsafe_extract end + +@inline function unsafe_extract( + ::TwoToFour, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + for i in 1:ksize(T) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) + (_, data) = leftshift_carry(data, 4, encoding) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::FourToTwo, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + for i in 1:ksize(T) + encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(T), + reinterpret(eltype(seq), encoding % UInt8), + ), + ) + end + (_, data) = leftshift_carry(data, 2, trailing_zeros(encoding) % UInt) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::Copyable, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(seq)) + for i in from_index:(from_index + ksize(T) - 1) + encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::AsciiEncode, + ::Type{T}, + seq::AbstractVector{UInt8}, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(T)) + @inbounds for i in from_index:(from_index + ksize(T) - 1) + byte = seq[i] + encoding = BioSequences.ascii_encode(Alphabet(T), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(T), repr(byte))) + end + (_, data) = leftshift_carry(data, bps, encoding % UInt) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::GenericRecoding, + ::Type{T}, + seq, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(T)) + @inbounds for i in 
1:ksize(T) + symbol = convert(eltype(T), seq[i]) + encoding = UInt(BioSequences.encode(Alphabet(T), symbol)) + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +################################################ +# Constructors with full parameterisation +################################################ + +function Kmer{A, K, N}(x) where {A, K, N} + check_kmer(Kmer{A, K, N}) + build_kmer(RecodingScheme(A(), typeof(x)), Kmer{A, K, N}, x) +end + +# BioSequences support indexing and fast length checks +@inline function build_kmer(R::RecodingScheme, ::Type{T}, s::BioSequence) where T + length(s) == K || error("Length of sequence must be K elements to build Kmer") + unsafe_extract(R, T, s, 1) +end + +# LongSequence with same alphabet: Extract whole coding elements +@inline function build_kmer(R::RecodingScheme, ::Type{Kmer{A, K, N}}, s::LongSequence{A}) where {A, K, N} + length(s) == K || error("Length of sequence must be K elements to build Kmer") + bps = BioSequences.BitsPerSymbol(A()) + data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), bps), Val{nsize(Kmer{A, K, N})}()) + (_, data) = rightshift_carry(data, bits_unused(T), zero(UInt)) + T(unsafe, data) +end + +# TODO: LongSubSeq: + +# For UTF8-strings combined with an ASCII kmer alphabet, we convert to byte vector +@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::Union{String, SubString{String}}) where T + build_kmer(R, T, codeunits(s)) +end + +# For byte vectors, we can build a kmer iff the kmer alphabet is AsciiAlphabet +@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::AbstractVector{UInt8}) where T + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + unsafe_extract(R, T, s, 1) +end + +# The generic fallback - dispatch on whether we can check length once +@inline function build_kmer(R::RecodingScheme, T::Type, s) + build_kmer(Base.IteratorSize(typeof(s)), R, T, s) +end + +@inline function 
build_kmer(::Base.SizeUnknown, ::RecodingScheme, T::Type, s) + data = zero_tuple(T) + A = Alphabet(T) + bps = BioSequences.bits_per_symbol(A) + i = 0 + for element in itr + i += 1 + i > K && error("Length of sequence must be K elements to build Kmer") + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A, symbol)) + (_, data) = leftshift_carry(data, bps, carry) + end + i == K || error("Length of sequence must be K elements to build Kmer") + T(unsafe, data) +end + +@inline function build_kmer(::Union{Base.HasLength, Base.HasShape}, ::RecodingScheme, T::Type, s) + length(s) == K || error("Length of sequence must be K elements to build Kmer") + data = zero_tuple(T) + A = Alphabet(T) + bps = BioSequences.bits_per_symbol(A) + for element in itr + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A, symbol)) + (_, data) = leftshift_carry(data, bps, carry) + end + T(unsafe, data) +end + +################################################ +# Derived constructors +################################################ + +Kmer{A, K}(x) where {A, K} = derive_type(Kmer{A, K})(x) + +function kmer(::Val{K}, s::BioSequence{A}) where {A, K} + K isa Int || error("K must be an Int") + Kmer{A, K}(s) +end + +################################################ +# Construct other types from Kmers +################################################ + +# TODO: LongSequence +# TODO: String + +################################################ +# String literals +################################################ + +macro mer_str(seq, flag) + trimmed = BioSequences.remove_newlines(seq) + ncu = ncodeunits(trimmed) + # Unlike @dna_str, we default to 2-bit alphabets, because kmers + # by convention are usually 2-bit only + if flag == "dna" || flag == "d" + Kmer{DNAAlphabet{2}, ncu}(trimmed) + elseif flag == "rna" || flag == "r" + Kmer{RNAAlphabet{2}, ncu}(trimmed) + elseif flag == "aa" || flag == "a" + Kmer{AminoAcidAlphabet,ncu}(trimmed) + else + error("Invalid type 
flag: '$(flag)'") + end +end \ No newline at end of file diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index b9dff05..3fff470 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -7,10 +7,6 @@ and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. Can be constructed more conventiently with the constructors `FwDNAMers{K}(s)` and similar also for `FwRNAMers` and `FwAAMers`. -If `A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}` and -`Alphabet(S) isa Union{DNAAlphabet{4}, RNAAlphabet{4}}`, the iterator skips all -kmers containing symbols not permitted in the 2-bit nucleotide alphabet. - # Examples: ```jldoctest julia> v = collect(FwDNAMers{3}("AGCGTATA")); @@ -35,119 +31,99 @@ end source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S load_source(x::FwKmers) = x.seq -function Base.length(it::FwKmers) - Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) - length(usable_source(it)) - ksize(eltype(it)) + 1 +function Base.length(it::FwKmers{A, K, S}) where {A, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + length(src) - ksize(eltype(it)) + 1 end # Constructors +FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} + const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} -FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} - FwDNAMers{K}(s) where {K} = FwDNAMers{K, typeof(s)}(s) FwRNAMers{K}(s) where {K} = FwRNAMers{K, typeof(s)}(s) FwAAMers{K}(s) where {K} = FwAAMers{K, typeof(s)}(s) -function Base.iterate(it::FwKmers, state...) - iterate_kmer(RecodingScheme(typeof(it)), it, state...) +function Base.iterate(it::FwKmers{A, K, S}, state...) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state...) 
+end + +# For the first kmer, we just forward to `unsafe_extract` +@inline function iterate_kmer(R::RecodingScheme, it::FwKmers) + length(it.seq) < ksize(eltype(it)) && return nothing + kmer = unsafe_extract(R, eltype(it), it.seq, 1) + (kmer, (kmer, ksize(eltype(it)) + 1)) end -# For these recoding schemes, no symbols in the source sequence are skipped. -# Hence, we can forward to just `extract` +# Here, we need to convert to an abstractvector @inline function iterate_kmer( - R::Union{GenericAlphabet, Copyable, TwoToFour, AsciiEncode, GenericBytes}, - it::FwKmers, -) - src = usable_source(it) + R::AsciiEncode, + it::FwKmers{A, K, S}, +) where {A <: Alphabet, K, S <: Bytes} + src = used_source(RecodingScheme(A(), S), it.seq) + Base.require_one_based_indexing(src) length(src) < ksize(eltype(it)) && return nothing - kmer = extract(R, eltype(it), src, 1) + kmer = unsafe_extract(R, eltype(it), src, 1) (kmer, (kmer, ksize(eltype(it)) + 1)) end -@inline function iterate_kmer(::GenericAlphabet, it::FwKmers, state::Tuple{Kmer, Int}) - src = usable_source(it) +@inline function iterate_kmer(::GenericRecoding, it::FwKmers, state::Tuple{Kmer, Int}) (kmer, i) = state - i > length(src) && return nothing - symbol = @inbounds src[i] + i > length(it.seq) && return nothing + symbol = @inbounds it.seq[i] new_kmer = shift(kmer, convert(eltype(kmer), symbol)) - (new_kmer, (new_kmer, i + 1)) + (new_kmer, (new_kmer, nextind(it.seq, i))) end @inline function iterate_kmer(::Copyable, it::FwKmers, state::Tuple{Kmer, Int}) - src = usable_source(it) (kmer, i) = state - i > length(src) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(src, i)) + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) new_kmer = shift_encoding(kmer, encoding) - (new_kmer, (new_kmer, i + 1)) + (new_kmer, (new_kmer, nextind(it.seq, i))) end @inline function iterate_kmer(::TwoToFour, it::FwKmers, state::Tuple{Kmer, Int}) - src = 
usable_source(it) (kmer, i) = state - i > length(src) && return nothing - encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(src, i))) + i > length(it.seq) && return nothing + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(it.seq, i))) new_kmer = shift_encoding(kmer, encoding) - (new_kmer, (new_kmer, i + 1)) + (new_kmer, (new_kmer, nextind(it.seq, i))) end @inline function iterate_kmer( - ::Skipping, - it::FwKmers{A, K}, - state::Tuple{Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), K, 1), + ::FourToTwo, + it::FwKmers{A, K, <:BioSequence}, + state::Tuple{Kmer, Int}, ) where {A, K} - (kmer, remaining, i) = state - src = usable_source(it) - while !iszero(remaining) - i > length(src) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(src, i)) - i += 1 - remaining = ifelse(isone(count_ones(encoding)), remaining - 1, K) - kmer = shift_encoding(kmer, trailing_zeros(encoding) % UInt) - end - return (kmer, (kmer, 1, i)) -end - -@inline function iterate_kmer(::GenericBytes, it::FwKmers, state::Tuple{Kmer, Int}) - src = usable_source(it) - Base.require_one_based_indexing(src) (kmer, i) = state - i > length(src) && return nothing - char = reinterpret(Char, (src[i] % UInt32) << 24) - symbol = eltype(eltype(it))(char) - kmer = shift(kmer, symbol) - return (kmer, (kmer, i + 1)) + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(kmer), + reinterpret(eltype(it.seq), encoding % UInt8), + ), + ) + end + kmer = shift_encoding(kmer, trailing_zeros(encoding) % UInt) + return (kmer, (kmer, nextind(it.seq, i))) end @inline function iterate_kmer(::AsciiEncode, it::FwKmers, state::Tuple{Kmer, Int}) - src = usable_source(it) + src = used_source(RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), it.seq) Base.require_one_based_indexing(src) (kmer, i) = state i > length(src) && 
return nothing - encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), @inbounds(src[i])) - kmer = shift_encoding(kmer, encoding) - return (kmer, (kmer, i + 1)) -end - -@inline function iterate_kmer( - ::AsciiSkipping, - it::FwKmers{A, K}, - state=(zero_kmer(Kmer{A, K}), K, 1), -) where {A, K} - (kmer, remaining, i) = state - src = usable_source(it) - Base.require_one_based_indexing(src) - while !iszero(remaining) - i > length(src) && return nothing - byte = @inbounds src[i] - i += 1 - encoding = @inbounds ASCII_SKIPPING_LUT[byte + 0x01] - encoding == 0xff && throw_bad_byte_error(byte) - remaining = ifelse(encoding == 0xf0, K, remaining - 1) - kmer = shift_encoding(kmer, encoding % UInt) + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) end - return (kmer, (kmer, 1, i)) + kmer = shift_encoding(kmer, encoding % UInt) + return (kmer, (kmer, nextind(src, i))) end diff --git a/src/iterators/common.jl b/src/iterators/common.jl index dcba46d..e680bde 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -15,10 +15,6 @@ function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} Kmer{A, K, n_coding_elements(Kmer{A, K})} end -const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} -const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} -const ByteSource = Union{String, SubString{String}, AbstractVector{UInt8}} - """ source_type(::Type{<:AbstractKmerIterator})::Type @@ -33,100 +29,11 @@ Get the data source from the kmer iterator. 
""" function load_source end -""" -usable_source(x::AbstractKmerIterator) - -Convert the source object into whatever is used by the iterator protocol -""" -function usable_source(x::AbstractKmerIterator)::Union{BioSequence, AbstractVector{UInt8}} - loaded = load_source(x) - return if loaded isa Union{BioSequence, AbstractVector{UInt8}} - loaded - elseif loaded isa Union{String, SubString{String}} - codeunits(loaded) - else - error("Does not know how to load data from source") - end -end - -# We can't compute the length for -# * Unknown alphabets, or unknown source types -# * A source type that can contain ambiguous nucleotides, -# while the kmer type does not. In this case, it's standard practise to -# skip these symbols. -function Base.IteratorSize(::Type{T}) where {T <: AbstractKmerIterator} - kT = eltype(T) - sT = source_type(T) - A = Alphabet(kT) - return if sT <: BioSequence && Alphabet(sT) == A - Base.HasLength() - elseif sT <: BioSequence && Alphabet(sT) isa TwoBit && A isa FourBit - Base.HasLength() - elseif sT <: ByteSource && A isa Union{FourBit, AminoAcidAlphabet} - Base.HasLength() - else - Base.SizeUnknown() - end -end - -"""Trait object which based on static dispatch determines how to recode from -the encoding of the source sequence to the encoding of the kmer""" -abstract type RecodingScheme end - -"We can copy the encoding straight from the source to the kmer" -struct Copyable <: RecodingScheme end - -"We can copy the encoding, then bitshift to create 4-bit encoding" -struct TwoToFour <: RecodingScheme end - -"We skip all symbols in source that contain unmappable symbols" -struct Skipping <: RecodingScheme end - -"We can use `BioSequences.ascii_encode`" -struct AsciiEncode <: RecodingScheme end - -"The source is bytes, but we need our own encoding table, -since we must skip ambiguous nucleotides" -struct AsciiSkipping <: RecodingScheme end - -"The source is a bytevector, and we have no static knowledge of efficient -conversion to the right encoding" 
-struct GenericBytes <: RecodingScheme end - -"Generic fallback when the source is a `BioSequence`" -struct GenericAlphabet <: RecodingScheme end - -function RecodingScheme(::Type{T})::RecodingScheme where {T <: AbstractKmerIterator} - A = Alphabet(eltype(T)) - sT = source_type(T) - return if sT <: BioSequence - As = Alphabet(sT) - if As == A - Copyable() - elseif As isa TwoBit && A isa TwoBit - Copyable() - elseif As isa FourBit && A isa FourBit - Copyable() - elseif As isa FourBit && A isa TwoBit - Skipping() - elseif As isa TwoBit && A isa FourBit - TwoToFour() - else - GenericAlphabet() - end - elseif sT <: ByteSource - codetype = BioSequences.codetype(A) - return if codetype isa BioSequences.AsciiAlphabet - if A isa TwoBit - AsciiSkipping() - else - AsciiEncode() - end - else - return GenericBytes() - end +function used_source(R::RecodingScheme, s) + if R isa AsciiEncode && s isa Union{String, SubString{String}} + codeunits(s) else - error("Cannot determine recoding scheme of iterator") + s end end @@ -152,88 +59,3 @@ const FOURBIT_COMPLEMENT_LUT = let end Tuple(v) end - -"Extract a full kmer at a given index of a sequence. -Note: These methods don't do any bounds checking" -function extract end -# TODO: Use extract elsewhere in this code base, e.g. kmer from string instantiation? 
- -@inline function extract( - ::GenericAlphabet, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - length(seq) < ksize(T) && return nothing - data = zero_tuple(T) - A = Alphabet(T) - bps = BioSequences.bits_per_symbol(A) - @inbounds for i in 1:ksize(T) - symbol = seq[i] - encoding = UInt(BioSequences.encode(A, convert(eltype(A), symbol)))::UInt - (_, data) = leftshift_carry(data, bps, encoding) - end - T(unsafe, data) -end - -@inline function extract( - ::TwoToFour, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - length(seq) < ksize(T) && return nothing - data = zero_tuple(T) - for i in 1:ksize(T) - encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) - (_, data) = leftshift_carry(data, 4, encoding) - end - T(unsafe, data) -end - -@inline function extract( - ::Copyable, - ::Type{T}, - seq::BioSequence, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(seq)) - for i in from_index:(from_index + ksize(T) - 1) - encoding = UInt(BioSequences.extract_encoded_element(seq, i)) - (_, data) = leftshift_carry(data, bps, encoding) - end - T(unsafe, data) -end - -@inline function extract( - ::AsciiEncode, - ::Type{T}, - seq::AbstractVector{UInt8}, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(kT)) - @inbounds for i in from_index:(from_index + ksize(T) - 1) - encoding = BioSequences.ascii_encode(Alphabet(T), seq[i]) - (_, data) = leftshift_carry(data, bps, encoding) - end - T(unsafe, data) -end - -@inline function extract( - ::GenericBytes, - ::Type{T}, - seq::AbstractVector{UInt8}, - from_index, -) where {T <: Kmer} - data = zero_tuple(T) - bps = BioSequences.bits_per_symbol(Alphabet(T)) - @inbounds for i in 1:ksize(T) - char = reinterpret(Char, (seq[i] % UInt32) << 24) - symbol = eltype(T)(char) - encoding = UInt(BioSequences.encode(Alphabet(T), symbol))::UInt - (_, data) = leftshift_carry(data, 
bps, encoding) - end - T(unsafe, data) -end diff --git a/src/kmer.jl b/src/kmer.jl index 5eb940d..38e3cef 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -117,10 +117,6 @@ end @inline derive_type(::Type{Kmer{A, K}}) where {A, K} = Kmer{A, K, n_coding_elements(Kmer{A, K})} -################################################ -# Constructors -################################################ - zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) # TODO: Should this somehow throw a MethodError if N is already parameterized? @@ -129,173 +125,6 @@ function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} T2(unsafe, zero_tuple(T2)) end -# Generic, unknown size -@inline function construct_generic( - ::Base.SizeUnknown, - T::Type{<:Kmer{A, K}}, - itr, -) where {A, K} - check_kmer(T) - data = zero_tuple(T) - nbits = BioSequences.bits_per_symbol(A()) - for (i, element) in enumerate(itr) - i > K && error("Length of sequence must be K elements to build Kmer") - symbol = convert(eltype(A), element) - carry = UInt(BioSequences.encode(A(), symbol)) - (_, data) = leftshift_carry(data, nbits, carry) - end - T(unsafe, data) -end - -# Generic, size known -@inline function construct_generic_unchecked( - ::Union{Base.HasLength, Base.HasShape}, - T::Type{<:Kmer{A}}, - itr, -) where {A} - check_kmer(T) - data = zero_tuple(T) - nbits = BioSequences.bits_per_symbol(A()) - for element in itr - symbol = convert(eltype(A), element) - carry = UInt(BioSequences.encode(A(), symbol)) - (_, data) = leftshift_carry(data, nbits, carry) - end - T(unsafe, data) -end - -# Generic, size known but length not checked. 
-@inline function construct_generic( - iT::Union{Base.HasLength, Base.HasShape}, - T::Type{<:Kmer{A, K}}, - itr, -) where {A, K} - length(itr) == K || error("Length of sequence must be K elements to build Kmer") - construct_generic_unchecked(iT, T, itr) -end - -# BioSequences with the same Alphabet and these element types do not need to decode -# and encode, but can copy the raw bits directly into the kmer -@inline function construct_unchecked( - T::Type{<:Kmer{A}}, - s::BioSequence{A}, - data_eltype::Type{E}, -) where {A <: Alphabet, E <: Union{UInt8, UInt16, UInt32, UInt}} - check_kmer(T) - data = zero_tuple(T) - nbits = BioSequences.bits_per_symbol(A()) - for i in 1:ksize(T) - (_, data) = - leftshift_carry(data, nbits, BioSequences.extract_encoded_element(s, i) % UInt) - end - T(unsafe, data) -end - -# With LongSequence of the same alphabet, entire coding elements can be copied -# directly. -# TODO: Test that LongSequence and LongSubSeq encoded_data_eltype is UInt -@inline function construct_unchecked( - T::Type{<:Kmer{A}}, - s::LongSequence{A}, - data_eltype::Type{UInt}, -) where {A <: Alphabet} - check_kmer(T) - Bps = BioSequences.BitsPerSymbol(A()) - data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), Bps), Val{nsize(T)}()) - (_, data) = rightshift_carry(data, bits_unused(T), zero(UInt)) - T(unsafe, data) -end - -# BioSequence with another element type fall back to the generic length constructor -@inline function construct_unchecked(T::Type{<:Kmer}, s::BioSequence, data_eltype::Type) - construct_generic_unchecked(Base.HasLength(), T, s) -end - -# BioSequence must implement length so we don't need to dispatch on that. 
-# However, if the encoded data eltype is an unsigned, we can use a specialized method where we don't -# decode each symbol but simply move the encoded data directly into the tuple -function Kmer{A, K, N}(s::BioSequence) where {A, K, N} - length(s) == K || error("Length of sequence must be K elements to build Kmer") - construct_unchecked(Kmer{A, K, N}, s, BioSequences.encoded_data_eltype(typeof(s))) -end - -# Generic constructor: Dispatch on the iteratorsize -function Kmer{A, K, N}(itr) where {A, K, N} - construct_generic(Base.IteratorSize(typeof(itr)), Kmer{A, K, N}, itr) -end - -# To avoid having the O(N) length check. TODO: Use optimised method -function Kmer{A, K, N}(s::Union{String, SubString{String}}) where {A, K, N} - construct_generic(Base.SizeUnknown(), Kmer{A, K, N}, s) -end - -################################################ -# Derived constructors -################################################ - -# BioSequence: Various missing type parameters -Kmer{A, K}(s::BioSequence) where {A, K} = derive_type(Kmer{A, K})(s) -Kmer{A}(s::BioSequence) where {A} = derive_type(Kmer{A, length(s)})(s) -Kmer(s::BioSequence{A}) where {A} = derive_type(Kmer{A, length(s)})(s) - -# Iterators: Various missing type parameters. 
-# It's too impractical to construct a kmer before we know the value of K, -# so either the iterator must have a known length, or else we need to collect -# it first -Kmer{A, K}(itr) where {A, K} = Kmer{A, K}(Base.IteratorSize(itr), itr) -Kmer{A, K}(::Base.SizeUnknown, itr) where {A, K} = Kmer{A, K}(collect(itr)) - -function Kmer{A, K}(iT::Union{Base.HasLength, Base.HasShape}, itr) where {A, K} - length(itr) == K || error("Length of sequence must be K elements to build Kmer") - construct_generic_unchecked(iT, derive_type(Kmer{A, K}), itr) -end - -Kmer{A}(itr) where {A} = Kmer{A}(Base.IteratorSize(itr), itr) -Kmer{A}(::Base.SizeUnknown, itr) where {A} = Kmer{A}(vec(collect(itr))) - -function Kmer{A}(iT::Union{Base.HasLength, Base.HasShape}, itr) where {A} - construct_generic_unchecked(iT, derive_type(Kmer{A, length(itr)}), itr) -end - -# Strings: Various missing type parameters -function Kmer{A}(s::Union{String, SubString{String}}) where {A} - construct_generic_unchecked(Base.HasLength(), derive_type(Kmer{A, length(s)}), s) -end - -function Kmer{A, K}(s::Union{String, SubString{String}}) where {A, K} - length(s) == K || error("Length of sequence must be K elements to build Kmer") - construct_generic_unchecked(Base.HasLength(), derive_type(Kmer{A, K}), s) -end - -# TODO: Constructor from LongSubSeq -# where whole coding elements can be copied directly over -# without extracting individual elements - -# TODO: Kmer => LongSequence constructor, same as above but opposite, kinda. - -# TODO: Constructor from String that predicts the alphabet? -# Maybe implement the guessparse function in BioSequences.jl -# (See related issue), then call it from here. 
- -################################################ -# String literals -################################################ - -macro mer_str(seq, flag) - trimmed = BioSequences.remove_newlines(seq) - # Unlike @dna_str, we default to 2-bit alphabets, because kmers - # by convention are usually 2-bit only - if flag == "dna" || flag == "d" - Kmer{DNAAlphabet{2}}(trimmed) - elseif flag == "rna" || flag == "r" - Kmer{RNAAlphabet{2}}(trimmed) - elseif flag == "aa" || flag == "a" - Kmer{AminoAcidAlphabet}(trimmed) - else - error("Invalid type flag: '$(flag)'") - end -end - ################## # Various methods ################## @@ -321,12 +150,22 @@ function Base.print(io::IO, s::Kmer) print(io, LongSequence(s)) end -Base.cmp(x::T, y::T) where {T <: Kmer} = cmp(x.data, y.data) -Base.:(==)(x::Kmer{A}, y::Kmer{A}) where {A} = x.data == y.data -Base.isless(x::T, y::T) where {T <: Kmer} = isless(x.data, y.data) +function Base.cmp(x::Kmer{A, K1}, y::Kmer{A, K2}) where {A, K1, K2} + if K1 < K2 + -1 + elseif K2 < K1 + 1 + else + cmp(x.data, y.data) + end +end + +Base.isless(x::Kmer, y::Kmer) = cmp(x, y) == -1 +Base.:(==)(x::Kmer, y::Kmer) = iszero(cmp(x, y)) + +Base.:(==)(x::Kmer, y::BioSequence) = throw(MethodError(==, (x, y))) +Base.:(==)(x::BioSequence, y::Kmer) = throw(MethodError(==, (x, y))) -Base.isequal(x::Kmer, y::BioSequence) = false -Base.isequal(x::BioSequence, y::Kmer) = false Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) function push(kmer::Kmer, s) diff --git a/src/transformations.jl b/src/transformations.jl index eda9ec7..a142df6 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -32,7 +32,7 @@ end # TODO: Should this be the generic BioSequence def in BioSequences.jl? 
function BioSequences.reverse_complement(x::Kmer) - reverse(complement(x)) + @inline(reverse(@inline(complement(x)))) end function BioSequences.canonical(x::Kmer) From c7e5e80117d8e2586b8f14cfbca2963d39ae888c Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Thu, 28 Dec 2023 17:30:01 +0100 Subject: [PATCH 18/33] Start tests --- Project.toml | 5 +- src/Kmers.jl | 5 +- src/construction.jl | 46 +++++---- src/kmer.jl | 21 ++-- test/runtests.jl | 229 +++++++++++++++++++++++++++++++++++++++++++- test/utils.jl | 4 + 6 files changed, 281 insertions(+), 29 deletions(-) diff --git a/Project.toml b/Project.toml index d7e4bed..160e09d 100644 --- a/Project.toml +++ b/Project.toml @@ -5,13 +5,16 @@ version = "0.1.0" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" [compat] BioSequences = "3.1.3" +Random = "1.10" julia = "1.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [targets] -test = ["Test"] +test = ["Test", "Random"] diff --git a/src/Kmers.jl b/src/Kmers.jl index 5526384..c50f987 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -20,7 +20,7 @@ export Kmer, # Immutable operations push, - pushfirst, + push_first, shift, shift_first, @@ -130,7 +130,8 @@ const unsafe = Unsafe() const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} const Bytes = Union{String, SubString{String}, AbstractVector{UInt8}} -const BitInteger = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128} +const BitInteger = + Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128} include("tuple_bitflipping.jl") include("kmer.jl") diff --git a/src/construction.jl b/src/construction.jl index 732a973..a2430cf 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -153,29 +153,33 @@ function Kmer{A, K, N}(x) where {A, K, N} end # BioSequences support indexing and fast length 
checks -@inline function build_kmer(R::RecodingScheme, ::Type{T}, s::BioSequence) where T - length(s) == K || error("Length of sequence must be K elements to build Kmer") +@inline function build_kmer(R::RecodingScheme, ::Type{T}, s::BioSequence) where {T} + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") unsafe_extract(R, T, s, 1) end -# LongSequence with same alphabet: Extract whole coding elements -@inline function build_kmer(R::RecodingScheme, ::Type{Kmer{A, K, N}}, s::LongSequence{A}) where {A, K, N} - length(s) == K || error("Length of sequence must be K elements to build Kmer") - bps = BioSequences.BitsPerSymbol(A()) - data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), bps), Val{nsize(Kmer{A, K, N})}()) +# LongSequence with compatible alphabet: Extract whole coding elements +@inline function build_kmer(R::Copyable, ::Type{T}, s::LongSequence) where {T} + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + bps = BioSequences.BitsPerSymbol(Alphabet(T)) + data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), bps), Val{nsize(T)}()) (_, data) = rightshift_carry(data, bits_unused(T), zero(UInt)) T(unsafe, data) end -# TODO: LongSubSeq: +# TODO: LongSubSeq with compatible alphabet # For UTF8-strings combined with an ASCII kmer alphabet, we convert to byte vector -@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::Union{String, SubString{String}}) where T +@inline function build_kmer( + R::AsciiEncode, + ::Type{T}, + s::Union{String, SubString{String}}, +) where {T} build_kmer(R, T, codeunits(s)) end # For byte vectors, we can build a kmer iff the kmer alphabet is AsciiAlphabet -@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::AbstractVector{UInt8}) where T +@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::AbstractVector{UInt8}) where {T} length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") unsafe_extract(R, T, 
s, 1) end @@ -190,23 +194,28 @@ end A = Alphabet(T) bps = BioSequences.bits_per_symbol(A) i = 0 - for element in itr + for element in s i += 1 - i > K && error("Length of sequence must be K elements to build Kmer") + i > ksize(T) && error("Length of sequence must be K elements to build Kmer") symbol = convert(eltype(A), element) carry = UInt(BioSequences.encode(A, symbol)) (_, data) = leftshift_carry(data, bps, carry) end - i == K || error("Length of sequence must be K elements to build Kmer") + i == ksize(T) || error("Length of sequence must be K elements to build Kmer") T(unsafe, data) end -@inline function build_kmer(::Union{Base.HasLength, Base.HasShape}, ::RecodingScheme, T::Type, s) - length(s) == K || error("Length of sequence must be K elements to build Kmer") +@inline function build_kmer( + ::Union{Base.HasLength, Base.HasShape}, + ::RecodingScheme, + T::Type, + s, +) + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") data = zero_tuple(T) A = Alphabet(T) bps = BioSequences.bits_per_symbol(A) - for element in itr + for element in s symbol = convert(eltype(A), element) carry = UInt(BioSequences.encode(A, symbol)) (_, data) = leftshift_carry(data, bps, carry) @@ -219,6 +228,7 @@ end ################################################ Kmer{A, K}(x) where {A, K} = derive_type(Kmer{A, K})(x) +Kmer{A1}(x::Kmer{A2, K, N}) where {A1, A2, K, N} = Kmer{A1, K, N}(x) function kmer(::Val{K}, s::BioSequence{A}) where {A, K} K isa Int || error("K must be an Int") @@ -246,8 +256,8 @@ macro mer_str(seq, flag) elseif flag == "rna" || flag == "r" Kmer{RNAAlphabet{2}, ncu}(trimmed) elseif flag == "aa" || flag == "a" - Kmer{AminoAcidAlphabet,ncu}(trimmed) + Kmer{AminoAcidAlphabet, ncu}(trimmed) else error("Invalid type flag: '$(flag)'") end -end \ No newline at end of file +end diff --git a/src/kmer.jl b/src/kmer.jl index 38e3cef..0e6dee9 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -74,7 +74,7 @@ This function should compile to a noop in case 
the parameterization is good. @inline function check_kmer(::Type{Kmer{A, K, N}}) where {A, K, N} if !(K isa Int) throw(ArgumentError("K must be an Int")) - elseif K < 1 + elseif K < 0 throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) end n = cld((K * BioSequences.bits_per_symbol(A())) % UInt, (sizeof(UInt) * 8) % UInt) % Int @@ -94,7 +94,8 @@ end @inline ksize(::Type{<:Kmer{A, K}}) where {A, K} = K @inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N @inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K -@inline bits_unused(T::Type{<:Kmer}) = n_unused(T) * BioSequences.bits_per_symbol(T) +@inline bits_unused(T::Type{<:Kmer}) = + n_unused(T) * BioSequences.bits_per_symbol(Alphabet(T)) @inline BioSequences.Alphabet(::Kmer{A}) where {A} = A() @@ -150,7 +151,7 @@ function Base.print(io::IO, s::Kmer) print(io, LongSequence(s)) end -function Base.cmp(x::Kmer{A, K1}, y::Kmer{A, K2}) where {A, K1, K2} +@inline function _cmp(x::Kmer{A1, K1}, y::Kmer{A2, K2}) where {A1, A2, K1, K2} if K1 < K2 -1 elseif K2 < K1 @@ -160,6 +161,13 @@ function Base.cmp(x::Kmer{A, K1}, y::Kmer{A, K2}) where {A, K1, K2} end end +# Here, we don't allow comparing twobit to fourbit sequences. We could do this semantically, +# but this would open a whole can of worms, be impossible to optimise and defeat the purpose +# of using Kmers. 
+Base.cmp(x::Kmer{A}, y::Kmer{A}) where {A} = _cmp(x, y) +Base.cmp(x::Kmer{<:FourBit}, y::Kmer{<:FourBit}) = _cmp(x, y) +Base.cmp(x::Kmer{<:TwoBit}, y::Kmer{<:TwoBit}) = _cmp(x, y) + Base.isless(x::Kmer, y::Kmer) = cmp(x, y) == -1 Base.:(==)(x::Kmer, y::Kmer) = iszero(cmp(x, y)) @@ -170,7 +178,8 @@ Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) function push(kmer::Kmer, s) bps = BioSequences.bits_per_symbol(kmer) - newT = derive_type(Kmer{A, length(kmer) + 1}) + A = Alphabet(kmer) + newT = derive_type(Kmer{typeof(A), length(kmer) + 1}) # If no free space in data, add new tuple new_data = if n_unused(typeof(kmer)) < bps (zero(UInt), kmer.data...) @@ -178,7 +187,7 @@ function push(kmer::Kmer, s) kmer.data end # leftshift_carry the new encoding in. - encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + encoding = UInt(BioSequences.encode(A, convert(eltype(kmer), s))) (_, new_data) = leftshift_carry(new_data, bps, encoding) newT(unsafe, new_data) end @@ -211,7 +220,7 @@ end typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) end -function pushfirst(kmer::Kmer{A}, s) where {A} +function push_first(kmer::Kmer{A}, s) where {A} bps = BioSequences.bits_per_symbol(A()) newT = derive_type(Kmer{A, length(kmer) + 1}) # If no free space in data, add new tuple diff --git a/test/runtests.jl b/test/runtests.jl index 41394eb..073ae1a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,10 @@ module TestKmers +using Test +using Random using Kmers using BioSequences -using Test +using BioSymbols include("utils.jl") @@ -20,7 +22,230 @@ include("utils.jl") end end -@testset "Construction" begin end +struct CharSymbol <: BioSymbol + x::Char +end +BioSymbols.prefix(::CharSymbol) = "Char" +BioSymbols.type_text(::CharSymbol) = "CharSymbol" +BioSymbols.isterm(::CharSymbol) = false + +# These two are not interface, but useful for the tests below +# (e.g. 
converting the sequence to a string) +Base.convert(::Type{Char}, x::CharSymbol) = x.x +Base.Char(x::CharSymbol) = convert(Char, x) +Base.convert(::Type{CharSymbol}, x::Char) = CharSymbol(x) +BioSymbols.isgap(::CharSymbol) = false + +# TODO: Should BioSymbols be updated to remove this? +BioSymbols.isvalid(::CharSymbol) = true + +struct CharAlphabet <: Alphabet end +Base.eltype(::Type{CharAlphabet}) = CharSymbol +BioSequences.symbols(::CharAlphabet) = ntuple(i -> CharSymbol(Char(i - 1)), Val{128}()) +BioSequences.encode(::CharAlphabet, c::CharSymbol) = reinterpret(UInt32, c.x) % UInt +BioSequences.decode(::CharAlphabet, c::UInt) = CharSymbol(reinterpret(Char, c % UInt32)) +BioSequences.BitsPerSymbol(::CharAlphabet) = BioSequences.BitsPerSymbol{32}() + +const ALPHABETS = [ + DNAAlphabet{2}(), + RNAAlphabet{2}(), + DNAAlphabet{4}(), + RNAAlphabet{4}(), + AminoAcidAlphabet(), + CharAlphabet(), +] + +@testset "Construction" begin + # Fundamentals + dna = dna"TAGCTAAC" + mer = Kmer{DNAAlphabet{2}, length(dna)}(dna) + @test mer isa Kmer{DNAAlphabet{2}, length(dna)} + @test DNACodon == DNAKmer{3, 1} + @test RNACodon == RNAKmer{3, 1} + + for A in ALPHABETS + Ta = typeof(A) + for L in [0, 3, 11, 41] + for i in 1:3 + # Fundamentals and length + s = randseq(A, SamplerUniform(symbols(A)), L) + mer = Kmer{Ta, L}(collect(s)) + @test mer isa Kmer{Ta, L} + @test length(mer) == L + @test string(mer) == string(s) + + # From string + mer2 = Kmer{Ta, L}(string(s)) + @test mer == mer2 + @test mer === mer2 + @test string(mer) == string(mer2) + + # From LongSequence + mer3 = Kmer{Ta, L}(s) + @test mer === mer3 + end + end + end + + # Construct from string + @testset "Construct from string" begin + for s in ["TAG", "ACCGAGCC", "TGATGCTATTAGG"] + L = length(s) + for ss in (s, view(s, 1:lastindex(s))) + @test DNAKmer{L, 1}(ss) == DNAKmer{L}(ss) + @test string(DNAKmer{L}(ss)) == ss + end + end + + s = "UHVKALRIQURPFLSMOF" + @test string(AAKmer{18}(s)) == s + + for s in ["αβγδϵ", "", "中国人大网"] + 
L = length(s) + for ss in (s, view(s, 1:lastindex(s))) + sq = Kmer{CharAlphabet, L}(ss) + @test string(sq) == s + @test [Char(i) for i in sq] == collect(ss) + end + end + + # Wrong length - also for iterators of unknown size + end + + @testset "Wrong length" begin + @test_throws Exception DNAKmer{4}("TAC") + @test_throws Exception DNAKmer{4}("TACCA") + @test_throws Exception Kmer{CharAlphabet, 2}(['T']) + @test_throws Exception AAMer{3}((AminoAcid(i) for i in "WPLK" if true)) + end + + @testset "Length must be given explicitly" begin + for s in ["TACA", ""] + @test_throws Exception DNAKmer("TACGA") + @test string(DNAKmer{length(s)}(s)) == s + end + @test_throws Exception AAMer(aa"WPLKM") + @test collect(AAKmer{5}(aa"WPLKM")) == collect(aa"WPLKM") + end + + @testset "Kmer literal" begin + @test collect(mer"TGAGTCA"d) == collect(dna"TGAGTCA") + @test collect(mer"WQOPMKAP"a) == collect(aa"WQOPMKAP") + @test collect(mer"UAUCGGAUC"r) == collect(rna"UAUCGGAUC") + end + + @testset "Construct from Biosequences" begin + @testset "Construct from LongSequence" begin + for seq in [ + dna"TAGGCA", + rna"UUCUGUGAGUCC", + aa"TTCGGAA", + LongSequence{CharAlphabet}("HELLO"), + ] + for sq in [seq, view(seq, 2:lastindex(seq))] + A = typeof(Alphabet(sq)) + @test Kmer{A, length(sq)}(sq) == Kmer{A, length(sq)}(string(sq)) + @test string(Kmer{A, length(sq)}(sq)) == string(sq) + @test_throws Exception Kmer{A, length(sq) + 1}(sq) + end + end + end + + @testset "Construct from kmer" begin + m = mer"TAGCGTTA"d + m2 = DNAKmer{8}(m) + @test m === m2 + @test_throws Exception DNAKmer{7}(m) + m3 = RNAKmer{8}(m) + @test m3 === mer"UAGCGUUA"r + @test_throws Exception RNAKmer{9}(m) + @test_throws Exception AAKmer{8}(m) + end + + # From generic biosequence - TODO + end + + @testset "Construct from iterable" begin + m1 = DNAKmer{6}((i for i in dna"GCGATC")) + m2 = DNAKmer{6}((i for i in dna"ATCGATGCAA" if i ∈ (DNA_A, DNA_C))) + @test m1 === mer"GCGATC"d + @test m2 === mer"ACACAA"d + m3 = DNAKmer{4}((i 
for i in rna"GAUC" if true)) + @test m3 === mer"GATC"d + end +end + +@testset "Comparison" begin + @testset "Equality" begin + @test mer"KMNUPQCX"a == mer"KMNUPQCX"a + @test mer"PKMNEA"a != mer"PKMNE"a + @test mer"IUDHLDJVIPOEJKWE"a != mer"IUDHLDJVIPOEJKW"a + end + + @testset "Ordering" begin + @test mer"UGCAG"r > mer"CGCAG"r + @test mer"TCGGAAG"d > mer"TCGGAAC"d + @test mer"OEWPM"a > mer"OEWP"a + @test mer"UGCGA"r > mer"TGAGA"d + end + + @testset "Hashing, isless and isequal" begin + @test hash(mer"POSMDGF"a, UInt(15)) === hash(mer"POSMDGF"a, UInt(15)) + @test isequal(mer"POSMDGF"a, mer"POSMDGF"a) + + # Same, but DNA/RNA + m1 = mer"TAGCTA"d + m2 = mer"UAGCUA"r + @test isequal(m1, m2) + @test hash(m1) === hash(m2) + m3 = Kmer{DNAAlphabet{4}}(m1) + m4 = Kmer{RNAAlphabet{4}}(m2) + @test isequal(m3, m4) + @test hash(m3) === hash(m4) + + # Other kmers + # This throws because we want kmer hashing to be maximally fast, + # which implies they must have a different hashing strategy from + # other BioSequences, which implies they can't be isequal + @test_throws Exception isequal(mer"UGCUGA"r, mer"UGCUGA"a) + @test !isequal(mer"UGCAC"r, mer"UGCGA"r) + + # Other sequences + @test_throws Exception dna"TAG" == mer"TAG"d + end +end + +@testset "Access" begin + # Scalar indexing + + # Index with UnitRange + + # Index with vector of indices + + # Boolean indexing +end + +@testset "Modification" begin + # Push, pushfirst + + # Shift, shift_first + + # Pop +end + +@testset "Biological operations" begin + # Reverse + + # Complement + + # Reverse complement + + # Canonical +end + +@testset "Translation" begin end + +@testset "Iterators" begin end # include("construction_and_conversion.jl") # include("comparisons.jl") diff --git a/test/utils.jl b/test/utils.jl index 9dffd1b..867ebca 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,3 +1,7 @@ +function random_seq(A::Alphabet, n::Integer) + randseq(A, SamplerUniform(symbols(A)), n) +end + # Return a random DNA/RNA sequence of the 
given length. function random_seq(n::Integer, nts, probs, outtype=String) cumprobs = cumsum(probs) From 2912114a1612f9f2967c9828d33b1a3e98098276 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 29 Dec 2023 08:59:00 +0100 Subject: [PATCH 19/33] More tests --- src/Kmers.jl | 6 + src/construction.jl | 3 +- src/indexing.jl | 14 ++ src/iterators/FwKmers.jl | 4 +- src/kmer.jl | 12 +- src/transformations.jl | 2 +- test/runtests.jl | 317 +++++++++++++++++++++++++++++++++++++-- test/utils.jl | 4 - 8 files changed, 331 insertions(+), 31 deletions(-) diff --git a/src/Kmers.jl b/src/Kmers.jl index c50f987..e82ac10 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -8,6 +8,7 @@ module Kmers export Kmer, + Mer, DNAKmer, RNAKmer, AAKmer, @@ -23,6 +24,7 @@ export Kmer, push_first, shift, shift_first, + pop, # Iterators FwKmers, @@ -30,6 +32,10 @@ export Kmer, FwRNAMers, FwAAMers, + # Reverse translation + CodonSet, + delete, # push already exported + ################## # Re-exports ################## diff --git a/src/construction.jl b/src/construction.jl index a2430cf..041d19b 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -159,7 +159,7 @@ end end # LongSequence with compatible alphabet: Extract whole coding elements -@inline function build_kmer(R::Copyable, ::Type{T}, s::LongSequence) where {T} +@inline function build_kmer(::Copyable, ::Type{T}, s::LongSequence) where {T} length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") bps = BioSequences.BitsPerSymbol(Alphabet(T)) data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), bps), Val{nsize(T)}()) @@ -168,6 +168,7 @@ end end # TODO: LongSubSeq with compatible alphabet +# Note: LongSequence may be UInt64 whereas kmers use UInt32 # For UTF8-strings combined with an ASCII kmer alphabet, we convert to byte vector @inline function build_kmer( diff --git a/src/indexing.jl b/src/indexing.jl index 58028ad..99ca6ac 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -38,6 
+38,20 @@ end T(unsafe, data) end +function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{<:Integer}) where {A} + K = length(indices) + N = n_coding_elements(Kmer{A, K}) + T = Kmer{A, K, N} + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for i in indices + checkbounds(kmer, i) + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + end + T(unsafe, data) +end + @inline function BioSequences.bitindex(kmer::Kmer, i::Unsigned)::Tuple{UInt, UInt} bps = BioSequences.bits_per_symbol(kmer) % UInt bpe = (8 * sizeof(UInt)) % UInt diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 3fff470..9838536 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -33,11 +33,11 @@ load_source(x::FwKmers) = x.seq function Base.length(it::FwKmers{A, K, S}) where {A, K, S} src = used_source(RecodingScheme(A(), S), it.seq) - length(src) - ksize(eltype(it)) + 1 + max(0, length(src) - ksize(eltype(it)) + 1) end # Constructors -FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)} +FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)}(s) const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} diff --git a/src/kmer.jl b/src/kmer.jl index 0e6dee9..2ad94b8 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -48,6 +48,9 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} end end +# Useful to do e.g. 
`mer"TAG"d isa Mer{3}` +const Mer{K} = Kmer{<:Alphabet, K} + # Aliases "Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} @@ -146,11 +149,6 @@ function Base.show(io::IO, ::MIME"text/plain", s::Kmer) print(io, s) end -function Base.print(io::IO, s::Kmer) - # TODO: Can be optimised but whatever - print(io, LongSequence(s)) -end - @inline function _cmp(x::Kmer{A1, K1}, y::Kmer{A2, K2}) where {A1, A2, K1, K2} if K1 < K2 -1 @@ -181,7 +179,7 @@ function push(kmer::Kmer, s) A = Alphabet(kmer) newT = derive_type(Kmer{typeof(A), length(kmer) + 1}) # If no free space in data, add new tuple - new_data = if n_unused(typeof(kmer)) < bps + new_data = if bits_unused(typeof(kmer)) < bps (zero(UInt), kmer.data...) else kmer.data @@ -224,7 +222,7 @@ function push_first(kmer::Kmer{A}, s) where {A} bps = BioSequences.bits_per_symbol(A()) newT = derive_type(Kmer{A, length(kmer) + 1}) # If no free space in data, add new tuple - new_data = if n_unused(typeof(kmer)) < bps + new_data = if bits_unused(typeof(kmer)) < bps (zero(UInt), kmer.data...) else kmer.data diff --git a/src/transformations.jl b/src/transformations.jl index a142df6..6ea7ad5 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -27,7 +27,7 @@ end # Generic fallback function BioSequences.complement(x::Kmer{<:NucleicAcidAlphabet}) - construct_generic_unchecked(Base.HasLength(), typeof(x), (complement(i) for i in x)) + typeof(x)((complement(i) for i in x)) end # TODO: Should this be the generic BioSequence def in BioSequences.jl? 
diff --git a/test/runtests.jl b/test/runtests.jl index 073ae1a..9963ba3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -87,7 +87,6 @@ const ALPHABETS = [ end end - # Construct from string @testset "Construct from string" begin for s in ["TAG", "ACCGAGCC", "TGATGCTATTAGG"] L = length(s) @@ -108,8 +107,6 @@ const ALPHABETS = [ @test [Char(i) for i in sq] == collect(ss) end end - - # Wrong length - also for iterators of unknown size end @testset "Wrong length" begin @@ -216,36 +213,324 @@ end end @testset "Access" begin - # Scalar indexing + @testset "Scalar indexing" begin + m = mer"TGATGCTAGTAGTATTCTATAG"d + @test m isa Mer{22} + + @test m[1] == first(m) == DNA_T + @test m[3] == DNA_A + @test last(m) == m[22] == DNA_G + + @test_throws BoundsError m[0] + @test_throws BoundsError m[-1] + @test_throws BoundsError m[23] + + for s in + [dna"TAGCAAC", dna"TWKKSVVDNA-A", rna"UGUGUCA", rna"UGUCGWS", aa"PLLKMDDSH"] + m = Kmer{typeof(Alphabet(s)), length(s)}(s) + @test first(m) == first(s) + @test last(m) == last(s) + for i in [1, 3, 5] + @test s[i] == m[i] + end + end + + # Weirdly, this throws ArgumentError + @test_throws Exception first(DNAKmer{0}("")) + @test_throws Exception last(RNAKmer{0}("")) + end + + @testset "Unit ranges" begin + m = mer"POKDGTWDIKVL"a + @test m isa Mer{12} + + @test m[1:3] === mer"POK"a + @test m[2:6] === mer"OKDGT"a + @test m[6:(end - 1)] === mer"TWDIKV"a + + @test m[eachindex(m)] === m + @test m[Base.OneTo(4)] === mer"POKD"a - # Index with UnitRange + @test_throws BoundsError m[0:4] + @test m[0:-1] == AAKmer{0}("") + @test_throws BoundsError m[2:13] + end + + @testset "With vector of indices" begin + m = mer"UGCUGAUCGUAU"r + @test m isa Mer{12} + + @test m[[1, 3, 5]] == mer"UCG"r + @test m[[12, 9, 7]] == mer"UGU"r + @test m[Int[]] == Kmer{RNAAlphabet{2}, 0}("") + + @test_throws BoundsError m[[2, 8, 15]] + @test_throws BoundsError m[[0, 1]] + @test_throws BoundsError m[[13]] + end - # Index with vector of indices + @testset "Logical 
indexing" begin + m = Kmer{CharAlphabet, 4}("ØÆGD") + @test m[[true, false, true, false]] == Kmer{CharAlphabet, 2}("ØG") + @test m[trues(4)] === m + @test m[falses(4)] === Kmer{CharAlphabet, 0}("") - # Boolean indexing + @test_throws BoundsError m[[true, false, true, true, true]] + @test_throws BoundsError m[[false, false, true]] + @test_throws BoundsError m[trues(5)] + end end @testset "Modification" begin - # Push, pushfirst + @testset "push, push_first" begin + m = mer"UHALSAP"a + @test push(m, AA_W) == mer"UHALSAPW"a + @test push(push(m, AA_W), AA_M) === mer"UHALSAPWM"a + @test push_first(m, AA_Gap) == mer"-UHALSAP"a + @test push_first(push(m, AA_K), AA_H) == mer"HUHALSAPK"a + + @test push(m, 'K') == mer"UHALSAPK"a + @test push(mer"TAG"d, RNA_A) == mer"TAGA"d + end + + @testset "shift, shiftfirst" begin + m = mer"PDOFPOLEF"a + v = collect(m) + for aa in aa"PLLMWFVB" + m = shift(m, aa) + @test m isa Mer{9} + popfirst!(push!(v, aa)) + @test collect(m) == v + end + + m = mer"AUGCGUA"r + v = collect(m) + for dna in dna"TAGTGTGCTA" + m = shift_first(m, dna) + @test m isa Mer{7} + pop!(pushfirst!(v, dna)) + @test collect(m) == v + end + end - # Shift, shift_first + @testset "pop" begin + m = mer"LNPQ"a + @test (m = pop(m)) == mer"LNP"a + @test (m = pop(m)) == mer"LN"a + @test (m = pop(m)) == mer"L"a + @test (m = pop(m)) == AAKmer{0}("") + @test_throws ArgumentError pop(m) - # Pop + @test pop(mer"MDFFIJFKL"a) === mer"MDFFIJFK"a + end end @testset "Biological operations" begin - # Reverse + for s in [ + dna"", + aa"", + LongDNA{2}(dna"TAGTGCA"), + LongRNA{2}(rna"UGCUGUAA"), + dna"TGASWKHVAAN--A", + rna"UAGUCUYMNS", + aa"LKHWSYYVQN", + LongSequence{CharAlphabet}("LKDSJ"), + LongSequence{CharAlphabet}("κ𝚶⊸∑Γ"), + ] + m = Kmer{typeof(Alphabet(s)), length(s)}(s) + + # Reverse + @test collect(reverse(m)) == reverse(collect(m)) + @test collect(reverse(m)) == collect(reverse(s)) + + # The rest of the operations are only for nucleotides + isa(Alphabet(s), NucleicAcidAlphabet) 
|| continue + + # Complement + @test collect(complement(s)) == collect(complement(m)) + + # Reverse complement + rv = reverse_complement(m) + @test collect(reverse_complement(s)) == collect(rv) + + # Canonical + can = canonical(m) + @test collect(can) == collect(canonical(s)) + @test can ≤ m + if can === m + @test m ≤ rv + else + @test can === rv + @test rv ≤ m + end + end +end - # Complement +@testset "Translation" begin + @testset "CodonSet" begin + codons = [RNACodon((i, j, k)) for i in mer"UACG"r, j in mer"UACG"r, k in mer"UACG"r] + @test length(Set(codons)) == 64 + sources = [ + [], + codons[[1, 4, 8]], + codons, + codons[rand(Bool, 64)], + codons[[4, 8]], + ] + csets = map(CodonSet, sources) + sets = map(Set, sources) + + # Basic properties + for (cset, set) in zip(csets, sets) + @test cset == set + @test sort!(collect(cset)) == sort!(collect(set)) + @test length(cset) == length(set) + @test isempty(cset) == isempty(set) + for i in set + @test i ∈ cset + end + end - # Reverse complement + for (si, ci) in zip(sets, csets), (sj, cj) in zip(sets, csets) + @test issubset(si, sj) == issubset(ci, cj) + for f in [union, setdiff, intersect, symdiff] + @test Set(f(ci, cj)) == f(si, sj) + end + end + end - # Canonical + @testset "Standard reverse genetic code" begin + seq = LongAA(collect((i for i in alphabet(AminoAcid) if i ∉ (AA_Gap, AA_U, AA_O)))) + codonsets = reverse_translate(seq) + seen_codons = Set{RNACodon}() + for (codonset, aa) in zip(codonsets, seq) + if isambiguous(aa) + bits = zero(compatbits(aa)) + for codon in codonset + bits |= compatbits(only(translate(codon))) + end + # selenocysteine and Pyrrolysine have bits + # 0x00300000. 
However, translating normal + # codons cannot get these amino acids, + # so we ignore them by masking their bits + @test bits == (compatbits(aa) & 0x000fffff) + else + @test isdisjoint(seen_codons, codonset) + union!(seen_codons, codonset) + for codon in codonset + @test only(translate(codon)) == aa + end + end + end + @test length(seen_codons) == 64 + end + + @testset "Custom reverse genetic code" begin + # TODO! + end +end + +@testset "Printing" begin + function test_print(s, str) + @test string(s) == str + io = IOBuffer() + print(io, s) + @test String(take!(io)) == str + end + + for s in [ + dna"", + aa"", + LongDNA{2}(dna"TAGTGCA"), + LongRNA{2}(rna"UGCUGUAA"), + dna"TGASWKHVAAN--A", + rna"UAGUCUYMNS", + aa"LKHWSYYVQN", + ] + test_print(s, string(s)) + end end -@testset "Translation" begin end +@testset "Iterators" begin + @testset "Forward iteration" begin + @testset "Aliases" begin + @test FwKmers{DNAAlphabet{2}, 3}(dna"TAGA") isa FwKmers{DNAAlphabet{2}, 3, LongDNA{4}} + @test FwDNAMers{4}(rna"UAGC") isa FwKmers{DNAAlphabet{2}, 4, LongRNA{4}} + @test FwRNAMers{4}(dna"TACA") isa FwKmers{RNAAlphabet{2}, 4, LongDNA{4}} + @test FwAAMers{4}(aa"LKCY") isa FwKmers{AminoAcidAlphabet, 4, LongAA} + end + + @testset "Smaller than K" begin + @test isempty(FwDNAMers{3}(dna"TA")) + @test isempty(FwAAMers{9}(aa"AOPJVPES")) + @test isempty(FwKmers{RNAAlphabet{4}, 6}(dna"ATGGA")) + end -@testset "Iterators" begin end + @testset "Conversible alphabets" begin + for (seqs, alphabets) in [ + ([ + LongDNA{2}("TGATGGCGTAGTA"), + LongRNA{2}("UCGUGCUA"), + LongDNA{2}("") + ], [ + DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4} + ]), # From two-bit + ([ + dna"TAGTCTGAC", + rna"UAGUCGAUUAGGCC" + ], [ + DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4} + ]), # From four-bit + ] + for seq in seqs, alphabet in alphabets + v1 = collect(FwKmers{alphabet, 3}(seq)) + v2 = [Kmer{alphabet, 3, 1}(seq[i:i+2]) for i in 1:length(seq)-2] + @test v1 == v2 + end + end + for 
seq in [ + dna"TGWSNVNTGA", rna"C-GGAU-WSNUCG" + ] + @test_throws Exception first(FwDNAMers{3}(seq)) + @test_throws Exception first(FwRNAMers{3}(seq)) + end + end + + @testset "Four to two bit" begin + for seq in [ + dna"TATGCTTCGTAGTCGTCGTTGCTA", + ] + for seqq in [seq, LongRNA{4}(seq)] + filtered = typeof(seqq)([i for i in seqq if !isambiguous(i)]) + for A in [DNAAlphabet{2}, RNAAlphabet{2}] + v1 = collect(FwKmers{A, 4}(seqq)) + v2 = [Kmer{A, 4, 1}(filtered[i:i+3]) for i in 1:length(filtered)-3] + @test v1 == v2 + end + end + end + end + + @testset "From ASCII bytes" begin + str = "TaghWS-TGnADbkWWMSTV" + T = FwKmers{DNAAlphabet{4}, 4} + mers = collect(T(str)) + for source in [ + str, + view(str, 1:lastindex(str)), + codeunits(str), + Vector(codeunits(str)), + ] + @test collect(T(source)) == mers + end + end + + # Unconvertible alphabet + # TODO + # Error in the constructor? + #@test_throws FwDNAMers{} + end +end # include("construction_and_conversion.jl") # include("comparisons.jl") diff --git a/test/utils.jl b/test/utils.jl index 867ebca..3761510 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -66,10 +66,6 @@ function random_rna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) return random_seq(n, ['A', 'C', 'G', 'U', 'N'], probs, Vector{RNA}) end -function random_rna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) - return random_seq(n, ['A', 'C', 'G', 'U', 'N'], probs, Vector{RNA}) -end - function random_aa_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) return random_seq( n, From 648c4bdf012c877b7a95a4ab3938a2d4e35c646d Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 29 Dec 2023 11:25:47 +0100 Subject: [PATCH 20/33] Add Canonical and UnambiguousKmers --- src/Kmers.jl | 9 +- src/construction.jl | 27 ++++ src/iterators/CanonicalKmers.jl | 254 ++++++++++++++---------------- src/iterators/FwKmers.jl | 13 +- src/iterators/SpacedKmers.jl | 114 -------------- src/iterators/UnambiguousKmers.jl | 123 +++++++++++++++ src/iterators/common.jl | 8 - 
test/runtests.jl | 57 +++---- 8 files changed, 298 insertions(+), 307 deletions(-) delete mode 100644 src/iterators/SpacedKmers.jl create mode 100644 src/iterators/UnambiguousKmers.jl diff --git a/src/Kmers.jl b/src/Kmers.jl index e82ac10..56a871c 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -31,6 +31,12 @@ export Kmer, FwDNAMers, FwRNAMers, FwAAMers, + CanonicalKmers, + CanonicalDNAMers, + CanonicalRNAMers, + UnambiguousKmers, + UnambiguousDNAMers, + UnambiguousRNAMers, # Reverse translation CodonSet, @@ -148,7 +154,8 @@ include("revtrans.jl") include("iterators/common.jl") include("iterators/FwKmers.jl") -#include("iterators/CanonicalKmers.jl") +include("iterators/CanonicalKmers.jl") +include("iterators/UnambiguousKmers.jl") #include("iterators/SpacedKmers.jl") end # module diff --git a/src/construction.jl b/src/construction.jl index 041d19b..3843b55 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -247,6 +247,33 @@ end # String literals ################################################ +""" + @mer_str -> Kmer + +Construct a `Kmer` from the given string. The macro must be used with a flag +after the input string, e.g. `d` in `mer"TAG"d` or `a` in `mer"PCW"a`, signifying +the alphabet of the kmer. +The flags `d = DNAAlphabet{2}`, `r = RNAAlphabet{2}` and `a = AminoAcidAlphabet` +are recognized. + +Because the macro is resolved and the kmer is created at parse time, +the macro is type stable, and may be used in high performance code. 
+ +# Examples +```jldoctest +julia> mer"UGCUA"r +RNA 5-mer: +UGCUA + +julia> mer"YKVSTEDLLKKR"a +AminoAcid 12-mer: +YKVSTEDLLKKR + +julia> mer"TATTAGCA"d +DNA 8-mer: +TATTAGCA +``` +""" macro mer_str(seq, flag) trimmed = BioSequences.remove_newlines(seq) ncu = ncodeunits(trimmed) diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl index 960cf7d..a37cd5a 100644 --- a/src/iterators/CanonicalKmers.jl +++ b/src/iterators/CanonicalKmers.jl @@ -1,82 +1,111 @@ -struct CanonicalKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} +""" + CanonicalKmers{A <: NucleicAcidAlphabet, K, S} + +Iterator of canonical nucleic acid kmers. The result of this iterator is equivalent +to calling `canonical` on each value of a `FwKmers` iterator, but may be more +efficient. + +!!! note + When counting small kmers, it may be more efficient to count `FwKmers`, + then call `canonical` only once per unique kmer. + +Can be constructed more conveniently with the constructors `CanonicalDNAMers{K}(s)` +and `CanonicalRNAMers{K}(s)`. + +# Examples: +```jldoctest +julia> collect(CanonicalRNAMers{3}("AGCGA")) +3-element Vector{Kmer{RNAAlphabet{2}, 3, 1}}: + AGC + CGC + CGA +``` +""" +struct CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} it::FwKmers{A, K, S} end source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S load_source(x::CanonicalKmers) = x.it.seq -Base.length(it::CanonicalKmers) = length(it.it) +@inline Base.length(it::CanonicalKmers) = length(it.it) # Constructors -function CanonicalKmers{A, K}(s::S) where {S, A <: Alphabet, K} - CanonicalKmers{S, A, K}(FwKmers{A, K}(s)) +function CanonicalKmers{A, K}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwKmers{A, K}(s)) end +function CanonicalKmers{A, K, S}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwKmers{A, K}(s)) +end + +const CanonicalDNAMers{K, S} = CanonicalKmers{DNAAlphabet{2}, K, S} +const CanonicalRNAMers{K, S} = 
CanonicalKmers{RNAAlphabet{2}, K, S} -# Iteration -function Base.iterate(it::CanonicalKmers, state...) - iterate_kmer(RecodingScheme(typeof(it)), it, state...) +@inline function Base.iterate(it::CanonicalKmers{A, K, S}, state...) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state...) end -# For these recoding schemes, no symbols in the source sequence are skipped. -# Hence, we can forward to just `extract`. -# Here, instead of reverse complementing each symbol, it's more efficient -# to do it in bulk by RC'ing the entire kmer +# For the first kmer, we extract it, then reverse complement. +# When it's not done incrementally, it's faster to RC the whole +# kmer at once. +@inline function iterate_kmer(R::RecodingScheme, it::CanonicalKmers) + length(it.it.seq) < ksize(eltype(it)) && return nothing + fw = unsafe_extract(R, eltype(it), it.it.seq, 1) + rv = reverse_complement(fw) + (fw < rv ? fw : rv, (fw, rv, ksize(eltype(it)) + 1)) +end + +# Here, we need to convert to an abstractvector @inline function iterate_kmer( - R::Union{GenericAlphabet, Copyable, TwoToFour, AsciiEncode, GenericBytes}, - it::CanonicalKmers, -) - src = usable_source(it) + R::AsciiEncode, + it::CanonicalKmers{A, K, S}, +) where {A <: NucleicAcidAlphabet, K, S <: Bytes} + src = used_source(RecodingScheme(A(), S), it.it.seq) + Base.require_one_based_indexing(src) length(src) < ksize(eltype(it)) && return nothing - fw = extract(R, eltype(it), src, 1) + fw = unsafe_extract(R, eltype(it), src, 1) rv = reverse_complement(fw) - (min(fw, rv), (fw, rv, ksize(eltype(it)) + 1)) + (fw < rv ? 
fw : rv, (fw, rv, ksize(eltype(it)) + 1)) end -# Fallback: Just because it's Copyable doesn't mean we have neat bit-tricks -# to RC the encoding @inline function iterate_kmer( - ::Union{GenericAlphabet, Copyable}, + ::GenericRecoding, it::CanonicalKmers, state::Tuple{Kmer, Kmer, Int}, ) - src = usable_source(it) (fw, rv, i) = state - i > length(src) && return nothing - symbol = @inbounds src[i] - encoding = UInt(BioSequences.encode(Alphabet(typeof(fw)), symbol))::UInt - rc_encoding = UInt(BioSequences.encode(Alphabet(typeof(fw)), complement(symbol)))::UInt - fw = shift_encoding(kmer, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) + i > length(it.it.seq) && return nothing + symbol = convert(eltype(fw), @inbounds it.it.seq[i]) + fw = shift(fw, symbol) + rv = shift_first(rv, complement(symbol)) + (fw < rv ? fw : rv, (fw, rv, i + 1)) end @inline function iterate_kmer( ::Copyable, - it::CanonicalKmers{<:TwoBit, K, <:TwoBit}, + it::CanonicalKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Kmer, Int}, ) where {K} - src = usable_source(it) (fw, rv, i) = state - i > length(src) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt - rc_encoding = encoding ⊻ UInt(3) - fw = shift_encoding(kmer, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) + i > length(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, encoding ⊻ 0x03) + (fw < rv ? 
fw : rv, (fw, rv, i + 1)) end @inline function iterate_kmer( ::Copyable, - it::CanonicalKmers{<:FourBit, K, <:FourBit}, + it::CanonicalKmers{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Kmer, Int}, ) where {K} - src = usable_source(it) (fw, rv, i) = state - i > length(src) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt - rc_encoding = (@inbounds(FOURBIT_COMPLEMENT_LUT[encoding + UInt(1)])) % UInt - fw = shift_encoding(kmer, encoding) + i > length(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + fw = shift_encoding(fw, encoding) + rc_encoding = + reinterpret(UInt8, complement(reinterpret(eltype(rv), encoding % UInt8))) % UInt rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) + (fw < rv ? fw : rv, (fw, rv, i + 1)) end @inline function iterate_kmer( @@ -84,117 +113,62 @@ end it::CanonicalKmers, state::Tuple{Kmer, Kmer, Int}, ) - src = usable_source(it) (fw, rv, i) = state - i > length(src) && return nothing - twobit_encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt - fw_encoding = reinterpret(UInt8, decode(Alphabet(fw), twobit_encoding)) % UInt - rc_encoding = reinterpret(UInt8, decode(Alphabet(fw), twobit_encoding ⊻ UInt(3))) % UInt - fw = shift_encoding(kmer, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) + i > length(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + fw = shift_encoding(fw, left_shift(UInt(1), encoding)) + rv = shift_first_encoding(rv, left_shift(UInt(1), encoding ⊻ 0x03)) + (fw < rv ? 
fw : rv, (fw, rv, i + 1)) end -# 4 -> 2 (skipping): Ascii skipping LUT, as with FwKmers - DEFAULT STATE @inline function iterate_kmer( - ::Skipping, - it::CanonicalKmers{A, K}, - state::Tuple{Kmer, Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), zero_kmer(Kmer{A, K}), K, 1), -) where {A, K} - src = usable_source(it) - (fw, rv, remaining, i) = state - while !iszero(remaining) - i > length(src) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(src, i))::UInt - i += 1 - if isone(count_ones(encoding)) - fw_encoding = trailing_zeros(encoding) % UInt - fw = shift_encoding(fw, fw_encoding) - rv = shift_first_encoding(rv, fw_encoding ⊻ UInt(3)) - remaining -= 1 - else - remaining = K - # No need to RC anything - continue - end - end - return (min(fw, rv), (fw, rv, 1, i)) -end - -@inline function iterate_kmer( - ::AsciiSkipping, - it::CanonicalKmers{A, K}, - state::Tuple{Kmer, Kmer, Int, Int}=(zero_kmer(Kmer{A, K}), zero_kmer(Kmer{A, K}), K, 1), -) where {A, K} - src = usable_source(it) - Base.require_one_based_indexing(src) - (fw, rv, remaining, i) = state - while !iszero(remaining) - i > length(src) && return nothing - byte = @inbounds src[i] - i += 1 - encoding = @inbounds BYTE_LUT[byte + 0x01] - encoding == 0xff && throw_bad_byte_error(byte) - if encoding == 0xf0 - remaining = K - continue - else - fw = shift_encoding(fw, encoding) - rv = shift_first_encoding(rv, encoding ⊻ UInt(3)) - remaining -= 1 - end - end - return (min(fw, rv), (fw, rv, 1, i)) -end - -@inline function iterate_kmer( - ::AsciiEncode, - it::CanonicalKmers, + ::FourToTwo, + it::CanonicalKmers{A, K, <:BioSequence}, state::Tuple{Kmer, Kmer, Int}, -) - src = usable_source(it) - Base.require_one_based_indexing(src) +) where {A, K} (fw, rv, i) = state - A = Alphabet(typeof(fw)) - i > length(src) && return nothing - encoding = UInt(BioSequences.ascii_encode(A, @inbounds(src[i])))::UInt - rc_encoding = - UInt(BioSequences.encode(A, complement(BioSequences.decode(A, encoding))))::UInt - fw = 
shift_encoding(kmer, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) + i > length(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(fw), + reinterpret(eltype(it.it.seq), encoding % UInt8), + ), + ) + end + enc = trailing_zeros(encoding) % UInt + fw = shift_encoding(fw, enc) + rv = shift_first_encoding(rv, enc ⊻ 0x03) + (fw < rv ? fw : rv, (fw, rv, i + 1)) end @inline function iterate_kmer( ::AsciiEncode, - it::CanonicalKmers{<:FourBit}, - state::Tuple{Kmer, Kmer, Int}, -) - src = usable_source(it) - Base.require_one_based_indexing(src) - (fw, rv, i) = state - A = Alphabet(typeof(fw)) - i > length(src) && return nothing - encoding = UInt(BioSequences.ascii_encode(A, @inbounds(src[i])))::UInt - rc_encoding = @inbounds(FOURBIT_COMPLEMENT_LUT[encoding + 0x01]) % UInt - fw = shift_encoding(kmer, encoding) - rv = shift_first_encoding(rv, rc_encoding) - (min(fw, rc), (fw, rv, i + 1)) -end - -@inline function iterate_kmer( - ::GenericBytes, it::CanonicalKmers, state::Tuple{Kmer, Kmer, Int}, ) - src = usable_source(it) + src = used_source( + RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), + it.it.seq, + ) Base.require_one_based_indexing(src) (fw, rv, i) = state i > length(src) && return nothing - char = reinterpret(Char, (src[i] % UInt32) << 24) - fw_symbol = eltype(fw)(char) - rc_symbol = complement(fw_symbol) - fw = shift(fw, fw_symbol) - rv = shift(rv, rc_symbol) - (min(fw, rc), (fw, rv, i + 1)) + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + end + # Hopefully this branch is eliminated at compile time... 
+ rc_encoding = if Alphabet(fw) isa FourBit + reinterpret(UInt8, complement(reinterpret(DNA, encoding))) + elseif Alphabet(fw) isa TwoBit + encoding ⊻ 0x03 + else + error("Unreachable") + end + fw = shift_encoding(fw, encoding % UInt) + rv = shift_first_encoding(rv, rc_encoding % UInt) + (fw < rv ? fw : rv, (fw, rv, i + 1)) end diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 9838536..1f5470b 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -28,10 +28,10 @@ struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} end end -source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S -load_source(x::FwKmers) = x.seq +source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S # TODO: Can be deleted? +load_source(x::FwKmers) = x.seq # TODO: Can be deleted? Is it unused, here and other defs -function Base.length(it::FwKmers{A, K, S}) where {A, K, S} +@inline function Base.length(it::FwKmers{A, K, S}) where {A, K, S} src = used_source(RecodingScheme(A(), S), it.seq) max(0, length(src) - ksize(eltype(it)) + 1) end @@ -43,11 +43,8 @@ const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} -FwDNAMers{K}(s) where {K} = FwDNAMers{K, typeof(s)}(s) -FwRNAMers{K}(s) where {K} = FwRNAMers{K, typeof(s)}(s) -FwAAMers{K}(s) where {K} = FwAAMers{K, typeof(s)}(s) - -function Base.iterate(it::FwKmers{A, K, S}, state...) where {A, K, S} +# TODO: Should this go in common? +@inline function Base.iterate(it::FwKmers{A, K, S}, state...) where {A, K, S} iterate_kmer(RecodingScheme(A(), S), it, state...) 
end diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl deleted file mode 100644 index 48093d7..0000000 --- a/src/iterators/SpacedKmers.jl +++ /dev/null @@ -1,114 +0,0 @@ -""" - SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given -longer `BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by `max(0, K - step)` bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not consistent, but the - reading frame will be preserved. - In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. 
-""" -struct SpacedKmers{A <: Alphabet, K, St, S} <: AbstractKmerIterator{A, K} - seq::S - - function SpacedKmers{A, K, St, S}(seq) where {A, K, St, S} - (K isa Int && K > 0) || error("K must be an Int > 0") - (St isa Int && St > 0) || error("St must be an Int > 0") - new{A, K, St, S}(seq) - end -end - -source_type(::Type{SpacedKmers{A, K, St, S}}) where {A, K, St, S} = S - -function Base.length(it::SpacedKmers{A, K, St}) where {A, K, St} - Base.IteratorSize(typeof(it)) == Base.HasLength() || throw(MethodError(length, (it,))) - available_starting_positions = length(it.seq) - ksize(eltype(it)) + 1 - div(available_starting_positions, St) -end - -# Constructors -SpacedKmers{A, K, St}(seq) where {A, K, St} = SpacedKmers{A, K, St, typeof(seq)}(seq) - -# Iterators -function Base.iterate(it::SpacedKmers{A, K, St, <:BioSequence{A}}) where {A, K, St} - if St ≥ K - iterate_copy_nomask(it, 1) - else - x = iterate_copy_nomask(it, 1) - x === nothing && return nothing - (kmer, _) = x - return (kmer, (K + 1, kmer.data)) - end -end - -function Base.iterate(it::SpacedKmers{A, K, St, <:BioSequence{A}}, state) where {A, K, St} - if St ≥ K - iterate_copy_nomask(it, state) - else - iterate_copy_mask(it, state) - end -end - -# Called when St ≥ K, and the encoding in seq matches that of the kmer. -# We can build the kmer from scratch at every iteration, simplifying the code -@inline function iterate_copy_nomask(it::SpacedKmers{A, K, St}, state::Int) where {A, K, St} - seq = it.seq - len = length(seq) - bps = BioSequences.bits_per_symbol(A()) - remaining = K - data = zero_tuple(eltype(it)) - while true - state > len && return nothing - encoding = UInt(BioSequences.extract_encoded_element(seq, state)) - (_, data) = leftshift_carry(data, bps, encoding) - state += 1 - remaining -= 1 - iszero(remaining) && return (eltype(it)(unsafe, data), state + max(0, St - K)) - end -end - -# Called when St < K, and the encoding in seq matches that of the kmer. 
-# We can copy the encoding right over, and we need to preserve some data in the kmer -# between iterations -@inline function iterate_copy_mask( - it::SpacedKmers{A, K, St}, - state::Tuple{Int, Tuple{Vararg{UInt}}}, -) where {A, K, St} - seq = it.seq - len = length(seq) - bps = BioSequences.bits_per_symbol(A()) - remaining = St - (index, data) = state - while true - index > len && return nothing - encoding = UInt(BioSequences.extract_encoded_element(seq, index)) - (_, data) = leftshift_carry(data, bps, encoding) - index += 1 - remaining -= 1 - if iszero(remaining) - # Mask out unused bits before we return the kmer. - (head, rest...) = data - kmer = eltype(it)(unsafe, (head & get_mask(eltype(it)), rest...)) - return (kmer, (index, data)) - end - end -end - -# TODO: Methods: -# 4 -> 2 bit -# 2 -> 4 bit? -# Byte sequence: 2 bit -# Byte sequence: other alphabets diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl new file mode 100644 index 0000000..a4bad4a --- /dev/null +++ b/src/iterators/UnambiguousKmers.jl @@ -0,0 +1,123 @@ +""" + UnambiguousKmers{A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}, K, S} + +Iterator of 2-bit nucleic acid kmers. This differs from `FwKmers` in that any kmers +containing ambiguous nucleotides are skipped, whereas using `FwKmers`, they result +in an error. + +Can be constructed more conventiently with the constructors `UnambiguousDNAMers{K}(s)` +and `UnambiguousRNAMers{K}(s)`. + +!!! note + To obtain canonical unambiguous kmers, simply call `canonical` on each kmer output +by `UnambiguousKmers`. 
+ +# Examples: +``` +julia> it = UnambiguousRNAMers{4}(dna"TGAGCWKCATC"); + +julia> collect(it) +3-element Vector{Kmer{RNAAlphabet{2}, 4, 1}}: + UGAG + GAGC + CAUC +``` +""" +struct UnambiguousKmers{A <: TwoBit, K, S} <: AbstractKmerIterator{A, K} + it::FwKmers{A, K, S} +end + +Base.IteratorSize(::Type{<:UnambiguousKmers}) = Base.SizeUnknown() +source_type(::Type{UnambiguousKmers{A, K, S}}) where {A, K, S} = S +load_source(x::UnambiguousKmers) = x.it.seq + +# Constructors +function UnambiguousKmers{A, K}(s::S) where {S, A <: TwoBit, K} + UnambiguousKmers{A, K, S}(FwKmers{A, K}(s)) +end +function UnambiguousKmers{A, K, S}(s::S) where {S, A <: TwoBit, K} + UnambiguousKmers{A, K, S}(FwKmers{A, K}(s)) +end + +const UnambiguousDNAMers{K, S} = UnambiguousKmers{DNAAlphabet{2}, K, S} +const UnambiguousRNAMers{K, S} = UnambiguousKmers{RNAAlphabet{2}, K, S} + +@inline function Base.iterate(it::UnambiguousKmers{A, K, S}) where {A, K, S} + state = (eltype(it)(unsafe, zero_tuple(eltype(it))), ksize(eltype(it)), 1) + iterate_kmer(RecodingScheme(A(), S), it, state) +end + +@inline function Base.iterate(it::UnambiguousKmers{A, K, S}, state) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state) +end + +@inline function iterate_kmer( + ::RecodingScheme, + it::UnambiguousKmers, + state::Tuple{Kmer, Int, Int}, +) + (kmer, remaining, index) = state + while !iszero(remaining) + index > lastindex(it.it.seq) && return nothing + symbol = convert(eltype(kmer), it.it.seq[index]) + index += 1 + if isambiguous(symbol) + remaining = ksize(eltype(it)) + else + remaining -= 1 + kmer = shift(kmer, symbol) + end + end + (kmer, (kmer, 1, index)) +end + +# Here, we can forward directly to FwKmers +@inline function iterate_kmer( + ::Copyable, + it::UnambiguousKmers, + state::Tuple{Kmer, Int, Int}, +) + (kmer, _, index) = state + iterate(it.it, (kmer, index)) +end + +@inline function iterate_kmer( + ::AsciiEncode, + it::UnambiguousKmers{A, K, S}, + state::Tuple{Kmer, Int, Int}, +) where {A 
<: TwoBit, K, S} + src = used_source(RecodingScheme(A(), S), it.it.seq) + Base.require_one_based_indexing(src) + (kmer, remaining, index) = state + while !iszero(remaining) + index > lastindex(src) && return nothing + byte = @inbounds src[index] + index += 1 + encoding = @inbounds ASCII_SKIPPING_LUT[(byte + 0x01) % Int] + if encoding == 0xff + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + elseif encoding == 0xf0 + remaining = ksize(eltype(it)) + else + remaining -= 1 + kmer = shift_encoding(kmer, encoding % UInt) + end + end + (kmer, (kmer, 1, index)) +end + +@inline function iterate_kmer( + ::FourToTwo, + it::UnambiguousKmers{A, K, S}, + state::Tuple{Kmer, Int, Int}, +) where {A <: TwoBit, K, S} + (kmer, remaining, index) = state + while !iszero(remaining) + index > lastindex(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, index))::UInt + kmer = shift_encoding(kmer, (trailing_zeros(encoding)) % UInt) + index += 1 + remaining = isone(count_ones(encoding)) ? 
remaining - 1 : ksize(eltype(it)) + end + (kmer, (kmer, 1, index)) +end diff --git a/src/iterators/common.jl b/src/iterators/common.jl index e680bde..6e2b2fa 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -51,11 +51,3 @@ const ASCII_SKIPPING_LUT = let end Tuple(v) end - -const FOURBIT_COMPLEMENT_LUT = let - v = fill(0x00, 16) - for i in alphabet(DNA) - v[reinterpret(UInt8, i) + 0x01] = reinterpret(UInt8, complement(i)) - end - Tuple(v) -end diff --git a/test/runtests.jl b/test/runtests.jl index 9963ba3..5194c4d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -370,13 +370,7 @@ end @testset "CodonSet" begin codons = [RNACodon((i, j, k)) for i in mer"UACG"r, j in mer"UACG"r, k in mer"UACG"r] @test length(Set(codons)) == 64 - sources = [ - [], - codons[[1, 4, 8]], - codons, - codons[rand(Bool, 64)], - codons[[4, 8]], - ] + sources = [[], codons[[1, 4, 8]], codons, codons[rand(Bool, 64)], codons[[4, 8]]] csets = map(CodonSet, sources) sets = map(Set, sources) @@ -424,7 +418,7 @@ end end @test length(seen_codons) == 64 end - + @testset "Custom reverse genetic code" begin # TODO! 
end @@ -454,7 +448,8 @@ end @testset "Iterators" begin @testset "Forward iteration" begin @testset "Aliases" begin - @test FwKmers{DNAAlphabet{2}, 3}(dna"TAGA") isa FwKmers{DNAAlphabet{2}, 3, LongDNA{4}} + @test FwKmers{DNAAlphabet{2}, 3}(dna"TAGA") isa + FwKmers{DNAAlphabet{2}, 3, LongDNA{4}} @test FwDNAMers{4}(rna"UAGC") isa FwKmers{DNAAlphabet{2}, 4, LongRNA{4}} @test FwRNAMers{4}(dna"TACA") isa FwKmers{RNAAlphabet{2}, 4, LongDNA{4}} @test FwAAMers{4}(aa"LKCY") isa FwKmers{AminoAcidAlphabet, 4, LongAA} @@ -468,43 +463,37 @@ end @testset "Conversible alphabets" begin for (seqs, alphabets) in [ - ([ - LongDNA{2}("TGATGGCGTAGTA"), - LongRNA{2}("UCGUGCUA"), - LongDNA{2}("") - ], [ - DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4} - ]), # From two-bit - ([ - dna"TAGTCTGAC", - rna"UAGUCGAUUAGGCC" - ], [ - DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4} - ]), # From four-bit + ( + [LongDNA{2}("TGATGGCGTAGTA"), LongRNA{2}("UCGUGCUA"), LongDNA{2}("")], + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}], + ), # From two-bit + ( + [dna"TAGTCTGAC", rna"UAGUCGAUUAGGCC"], + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}], + ), # From four-bit ] for seq in seqs, alphabet in alphabets v1 = collect(FwKmers{alphabet, 3}(seq)) - v2 = [Kmer{alphabet, 3, 1}(seq[i:i+2]) for i in 1:length(seq)-2] + v2 = [Kmer{alphabet, 3, 1}(seq[i:(i + 2)]) for i in 1:(length(seq) - 2)] @test v1 == v2 end end - for seq in [ - dna"TGWSNVNTGA", rna"C-GGAU-WSNUCG" - ] + for seq in [dna"TGWSNVNTGA", rna"C-GGAU-WSNUCG"] @test_throws Exception first(FwDNAMers{3}(seq)) @test_throws Exception first(FwRNAMers{3}(seq)) end end @testset "Four to two bit" begin - for seq in [ - dna"TATGCTTCGTAGTCGTCGTTGCTA", - ] + for seq in [dna"TATGCTTCGTAGTCGTCGTTGCTA"] for seqq in [seq, LongRNA{4}(seq)] filtered = typeof(seqq)([i for i in seqq if !isambiguous(i)]) for A in [DNAAlphabet{2}, RNAAlphabet{2}] v1 = collect(FwKmers{A, 4}(seqq)) - v2 = [Kmer{A, 4, 
1}(filtered[i:i+3]) for i in 1:length(filtered)-3] + v2 = [ + Kmer{A, 4, 1}(filtered[i:(i + 3)]) for + i in 1:(length(filtered) - 3) + ] @test v1 == v2 end end @@ -515,12 +504,8 @@ end str = "TaghWS-TGnADbkWWMSTV" T = FwKmers{DNAAlphabet{4}, 4} mers = collect(T(str)) - for source in [ - str, - view(str, 1:lastindex(str)), - codeunits(str), - Vector(codeunits(str)), - ] + for source in + [str, view(str, 1:lastindex(str)), codeunits(str), Vector(codeunits(str))] @test collect(T(source)) == mers end end From 1192b722b8f530afef95c40dc8e398134017916e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Fri, 29 Dec 2023 14:46:01 +0100 Subject: [PATCH 21/33] Update some docstrings --- src/kmer.jl | 140 ++++++++++++++++++++++++++++++++++++++++------- src/revtrans.jl | 13 ++++- test/runtests.jl | 51 +++++++++++++++++ 3 files changed, 181 insertions(+), 23 deletions(-) diff --git a/src/kmer.jl b/src/kmer.jl index 2ad94b8..eb33a2b 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -1,28 +1,30 @@ """ Kmer{A<:Alphabet,K,N} <: BioSequence{A} -A parametric, immutable, bitstype for representing k-mers - short sequences +An immutable bitstype for representing k-mers - short `BioSequences` of a fixed length `K`. Since they can be stored directly in registers, `Kmer`s are generally the most efficient type of `BioSequence`, when `K` is small and known at compile time. + The `N` parameter is derived from `A` and `K` and is not a free parameter. 
+See also: [`DNAKmer`](@ref), [`RNAKmer`](@ref), [`AAKmer`](@ref), [`AbstractKmerIterator`](@ref) + # Examples ```jldoctest -julia> m = Kmer{DNAAlphabet{4}}("AGCKN") # type-unstable -DNA 5-mer -AGCKN +julia> RNAKmer{5}("ACGUC") +RNA 5-mer +ACGUC -julia> length(m) == 5 -true +julia> Kmer{DNAAlphabet{4}, 6}(dna"TGCTTA") +DNA 6-mer +TGCTTA -julia> DNAKmer(dna"TGCTTA") isa DNAKmer{6} -true - -julia> AAKmer((lowercase(i) for i in "KLWYR")) isa AAKmer{5} -true +julia> AAKmer{5}((lowercase(i) for i in "KLWYR")) +AminoAcid 5-mer +KLWYR -julia> RNAKmer{3}("UA") +julia> RNAKmer{3}("UAUC") # wrong length ERROR: [ ... ] ``` @@ -49,22 +51,40 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} end # Useful to do e.g. `mer"TAG"d isa Mer{3}` +""" + Mer{K} + +Alias for `Kmer{<:Alphabet, K}`. Useful to dispatch on `K-mers` without regard +for the alphabet. + +# Example +```jldoctest +julia> mer"DEKR"a isa Mer{4} +true + +julia> DNAKmer{6}("TGATCA") isa Mer{6} +true + +julia> RNACodon <: Mer{3} +true +``` +""" const Mer{K} = Kmer{<:Alphabet, K} # Aliases -"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" +"Alias for `Kmer{DNAAlphabet{2},K,N}`" const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} -"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" +"Alias for `Kmer{RNAAlphabet{2},K,N}`" const RNAKmer{K, N} = Kmer{RNAAlphabet{2}, K, N} -"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" +"Alias for `Kmer{AminoAcidAlphabet,K,N}`" const AAKmer{K, N} = Kmer{AminoAcidAlphabet, K, N} -"Shorthand for `DNAKmer{3,1}`" +"Alias for `DNAKmer{3,1}`" const DNACodon = DNAKmer{3, 1} -"Shorthand for `RNAKmer{3,1}`" +"Alias for `RNAKmer{3,1}`" const RNACodon = RNAKmer{3, 1} """ @@ -174,6 +194,29 @@ Base.:(==)(x::BioSequence, y::Kmer) = throw(MethodError(==, (x, y))) Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) +""" + push(kmer::Kmer{A, K}, s)::Kmer{A, K+1} + +Create a new kmer which is the concatenation of `kmer` and `s`. +Returns a `K+1`-mer. + +!!! 
warning + Since the output of this function is a `K+1`-mer, use of this function + in a loop may result in type-instability. + +See also: [`push_first`](@ref), [`pop`](@ref), [`shift`](@ref) + +# Examples +```jldoctest +julia> push(mer"UGCUGA"r, RNA_G) +RNA 7-mer +UGCUGAG + +julia> push(mer"W"a, 'E') +AminoAcid 2-mer +WE +``` +""" function push(kmer::Kmer, s) bps = BioSequences.bits_per_symbol(kmer) A = Alphabet(kmer) @@ -191,9 +234,13 @@ function push(kmer::Kmer, s) end """ -shift(kmer::kmer, symbol)::typeof(kmer) + shift(kmer::Kmer{A, K}, s)::Kmer{A, K} Push `symbol` onto the end of `kmer`, and pop the first symbol in `kmer`. +Unlike `push`, this preserves the input type, and is less likely to result in +type instability. + +See also: [`shift_first`](@ref), [`push`](@ref) # Examples ```jldoctest @@ -201,7 +248,7 @@ julia> shift(mer"TACC"d, DNA_A) DNA 4-mer ACCA -julia> shift(mer"WKYMLPIIRS"aa, AA_F) +julia> shift(mer"WKYMLPIIRS"aa, 'F') AminoAcid 10-mer KYMLPIIRSF ``` @@ -218,6 +265,30 @@ end typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) end +""" + push_first(kmer::Kmer{A, K}, s)::Kmer{A, K+1} + +Create a new kmer which is the concatenation of `s` and `kmer`. +Returns a `K+1`-mer. Similar to [`push`](@ref), but places the new symbol `s` +at the front. + +!!! warning + Since the output of this function is a `K+1`-mer, use of this function + in a loop may result in type-instability. + +See also: [`push`](@ref), [`pop`](@ref), [`shift`](@ref) + +# Examples +```jldoctest +julia> push_first(mer"GCU"r, RNA_G) +RNA 4-mer +GGCU + +julia> push_first(mer"W"a, 'E') +AminoAcid 2-mer +EW +``` +""" function push_first(kmer::Kmer{A}, s) where {A} bps = BioSequences.bits_per_symbol(A()) newT = derive_type(Kmer{A, length(kmer) + 1}) @@ -238,13 +309,15 @@ end Push `symbol` onto the start of `kmer`, and pop the last symbol in `kmer`. 
+See also: [`shift`](@ref), [`push`](@ref) + # Examples ```jldoctest julia> shift_first(mer"TACC"d, DNA_A) DNA 4-mer ATAC -julia> shift_first(mer"WKYMLPIIRS"aa, AA_F) +julia> shift_first(mer"WKYMLPIIRS"aa, 'F') AminoAcid 10-mer FWKYMLPIIR ``` @@ -262,6 +335,33 @@ function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where {A} typeof(kmer)(unsafe, (head, tail...)) end +""" + pop(kmer::Kmer{A, K})::Kmer{A, K-1} + +Returns a new kmer with the last symbol of the input `kmer` removed. +Throws an `ArgumentError` if `kmer` is empty. + +!!! warning + Since the output of this function is a `K-1`-mer, use of this function + in a loop may result in type-instability. + +See also: [`push`](@ref), [`shift`](@ref) + +# Examples +```jldoctest +julia> pop(mer"TCTGTA"d) +DNA 5-mer +TCTGT + +julia> pop(mer"QPSY"a) +AminoAcid 3-mer +QPS + +julia> pop(mer""a) +ERROR: ArgumentError: +[...] +``` +""" function pop(kmer::Kmer{A}) where {A} isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) bps = BioSequences.bits_per_symbol(A()) diff --git a/src/revtrans.jl b/src/revtrans.jl index e43a79e..55b9c71 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -24,8 +24,6 @@ Kmers.CodonSet with 4 elements: UAG UUU ``` - -See also: `push` """ struct CodonSet <: AbstractSet{RNACodon} x::UInt64 @@ -138,11 +136,20 @@ function Base.iterate(c::ReverseGeneticCode, s=1) end """ - reverse_translate!(v::Vector{CodonSet}, s::AASeq code=rev_standard_genetic_code) + reverse_translate!(v::Vector{CodonSet}, s::AASeq, code=rev_standard_genetic_code) -> v Reverse-translates `s` under the reverse genetic code `code`, putting the result in `v`. See also: [`reverse_translate`](@ref) + +# Examples: +```jldoctest +julia> v = CodonSet[]; + +julia> reverse_translate!(v, aa"KWCL") +4-element Vector{CodonSet} +[...] 
+``` """ function reverse_translate!(v::Vector{CodonSet}, seq::AASeq, code=rev_standard_genetic_code) resize!(v, length(seq)) diff --git a/test/runtests.jl b/test/runtests.jl index 5194c4d..07c0bd4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -367,6 +367,57 @@ end end @testset "Translation" begin + @testset "Forward translation" begin + # Empty + @test translate(mer""r) == mer""a + @test translate(mer""d) == mer""a + @test translate(Kmer{DNAAlphabet{4}, 0}("")) == mer""a + + # Not divisible by 3 + @test_throws Exception translate(mer"U"r) + @test_throws Exception translate(mer"UGCA"r) + @test_throws Exception translate(mer"GUCGAUUGUC"r) + + # Containing gaps + @test_throws Exception translate(Kmer{DNAAlphabet{4}, 6}("CTGA-C")) + @test_throws Exception translate(Kmer{RNAAlphabet{4}, 3}("UC-")) + + # Invalid alphabet + @test_throws Exception translate(mer"CCC"a) + @test_throws Exception translate(Kmer{CharAlphabet, 3}("GGG")) + + # Compare to LongSequence + for s in [ + rna"UCGUAGUUCGAUUCUAUGCUGUAGUGGCAA", + rna"UCGUAGGCGUAUUGCGCAAAGCGC", + rna"UGCUAGUGUUCGAAA", + rna"UCGUUAGUAAAA", + ] + for A in [DNAAlphabet{4}, RNAAlphabet{2}, DNAAlphabet{2}, RNAAlphabet{4}] + ss = LongSequence{A}(s) + @test collect(translate(ss)) == collect(translate(Kmer{A, length(s)}(s))) + end + end + + for s in [ + rna"UGCUGAWKVUDUGWUGUDHUAGUGCNUBGKUGCMGGSWC", + rna"UCGUAGUCKGUCGUYCUGAGGWUGCUGANNUGCUGA", + rna"CAGGCCAGWGCUGSSSCUGSMGKYVUCUAS", + ] + for A in [DNAAlphabet{4}, RNAAlphabet{4}] + ss = LongSequence{A}(s) + @test collect(translate(ss)) == collect(translate(Kmer{A, length(s)}(s))) + end + end + + # Skip 1, the index of gap (which cannot be translated) + A = alphabet(RNA) + for i in 2:16, j in 2:16, k in 2:16 + mer = Kmer{RNAAlphabet{4}, 3}((A[i], A[j], A[k])) + @test only(translate(mer)) == only(translate(LongSequence(mer))) + end + end + @testset "CodonSet" begin codons = [RNACodon((i, j, k)) for i in mer"UACG"r, j in mer"UACG"r, k in mer"UACG"r] @test length(Set(codons)) 
== 64 From 30952eb210987756aae3e7e014daf0806431e821 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 30 Dec 2023 10:48:32 +0100 Subject: [PATCH 22/33] Add some docs --- .gitignore | 1 + README.md | 32 +++--- docs/Project.toml | 4 +- docs/make.jl | 29 ++--- docs/src/construction.md | 12 --- docs/src/index.md | 56 ++++------ docs/src/iteration.md | 31 ------ docs/src/kmer_types.md | 92 ---------------- docs/src/kmers.md | 186 ++++++++++++++++++++++++++++++++ docs/src/predicates.md | 18 ---- docs/src/random.md | 14 --- docs/src/transforms.md | 52 --------- docs/src/translate.md | 62 ----------- docs/src/translation.md | 20 ++++ src/Kmers.jl | 8 +- src/construction.jl | 2 +- src/iterators/CanonicalKmers.jl | 3 +- src/iterators/FwKmers.jl | 9 +- src/iterators/common.jl | 13 +-- src/kmer.jl | 108 +++++++++++++------ src/revtrans.jl | 18 ++-- src/transformations.jl | 3 +- src/tuple_bitflipping.jl | 16 +-- 23 files changed, 377 insertions(+), 412 deletions(-) delete mode 100644 docs/src/construction.md delete mode 100644 docs/src/iteration.md delete mode 100644 docs/src/kmer_types.md create mode 100644 docs/src/kmers.md delete mode 100644 docs/src/predicates.md delete mode 100644 docs/src/random.md delete mode 100644 docs/src/transforms.md delete mode 100644 docs/src/translate.md create mode 100644 docs/src/translation.md diff --git a/.gitignore b/.gitignore index 8cb4c0b..37956a9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .DS_Store Manifest.toml TODO.md +docs/build diff --git a/README.md b/README.md index 37f4b6a..2dd7595 100644 --- a/README.md +++ b/README.md @@ -3,34 +3,36 @@ [![Latest Release](https://img.shields.io/github/release/BioJulia/Kmers.jl.svg)](https://github.com/BioJulia/Kmers.jl/releases/latest) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE) [![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable) 
-[![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) ## Description -Kmers provide the `Kmer <: BioSequence` type which implement the concept of a -[k-mer](https://en.wikipedia.org/wiki/K-mer). +Kmers.jl provides the `Kmer <: BioSequence` type which implements the concept of a +[k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`. -A k-mer is a biological sequence of exactly length `k`. k-mers are used frequently -in bioinformatics because, when k is small and known at compile time, these -sequences can be efficiently represented as integers and stored directly in -CPU registers, allowing for much more efficient computation than arbitrary-length -sequences. + +K-mers are used frequently in bioinformatics because, when k is small and known at +compile time, these sequences can be efficiently represented as integers and stored +directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. + +In Kmers.jl, the `Kmer` type is parameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. Conceptually, one may use the following analogy: -* `BioSequence` is like `AbstractString` and `AbstractVector` -* `LongSequence` is like `String` and `Vector` -* `Kmer` is like [`InlineString`](https://github.com/JuliaStrings/InlineStrings.jl) - and [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) +* `BioSequence` is like `AbstractVector` +* `LongSequence` is like `Vector` +* `Kmer` is like [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) from `StaticArrays` Kmers.jl is tightly coupled to the [`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, -and rely on its internals. +and relies on its internals. Hence, you should expect strict compat bounds on BioSequences.jl. ## Usage - ### ⚠️ WARNING ⚠️ `Kmer`s are parameterized by their length. 
That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such as way that the compiler can use constant folding. +Further, as `Kmer`s are immutable and their operations are aggressively inlined and unrolled, +they become inefficent as they get longer. +For example, reverse-complementing a 32-mer takes 26 ns, compared to 102 ns for the equivalent `LongSequence`. However, for 512-mers, the `LongSequence` takes 126 ns, and the `Kmer` 16 μs! + Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead ## Installation @@ -38,7 +40,7 @@ You can install BioSequences from the julia REPL. Press `]` to enter pkg mode, and enter the following: ```julia -add Kmers +pkg> add Kmers ``` If you are interested in the cutting edge of development, please check out diff --git a/docs/Project.toml b/docs/Project.toml index e064fd1..0c33fe1 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,7 @@ [deps] +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Kmers = "445028e4-d31f-4f27-89ad-17affd83fc22" [compat] -Documenter = "0.24" \ No newline at end of file +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index 200452b..8c5d80c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,24 +1,27 @@ using Documenter, Kmers +DocMeta.setdocmeta!( + Kmers, + :DocTestSetup, + :(using BioSequences, Kmers, Test); + recursive=true, +) + makedocs(; + modules=[Kmers], format=Documenter.HTML(), sitename="Kmers.jl", pages=[ "Home" => "index.md", - "Kmer types" => "kmer_types.md", - "Constructing kmers" => "construction.md", - "Indexing & modifying kmers" => "transforms.md", - "Predicates" => "predicates.md", - "Random kmers" => 
"random.md", - "Iterating over Kmers" => "iteration.md", - "Translation" => "translate.md", - #"Pattern matching and searching" => "sequence_search.md", - #"Iteration" => "iteration.md", - #"Counting" => "counting.md", - #"I/O" => "io.md", - #"Interfaces" => "interfaces.md" + "The Kmer type" => "kmers.md", + "Translation" => "translation.md", + # The kmer type (construction, indexing) + # Kmer iteration + # Translation (revtrans also) + # FAQ (why not compare to bioseq, why no unambig canonical) ], - authors="Ben J. Ward, The BioJulia Organisation and other contributors.", + authors="Jakob Nybo Nissen, Sabrina J. Ward, The BioJulia Organisation and other contributors.", + checkdocs=:exports, ) deploydocs(; diff --git a/docs/src/construction.md b/docs/src/construction.md deleted file mode 100644 index 6b300a9..0000000 --- a/docs/src/construction.md +++ /dev/null @@ -1,12 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Construction & conversion - -```@docs -Kmer{A,K,N}(itr) -``` diff --git a/docs/src/index.md b/docs/src/index.md index 11c4fe9..455f155 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,59 +1,49 @@ -# Kmers +## Kmers.jl +Kmers.jl provide the `Kmer <: BioSequence` type which implement the concept of a +[k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`. 
-[![Latest Release](https://img.shields.io/github/release/BioJulia/Kmers.jl.svg)](https://github.com/BioJulia/Kmers.jl/releases/latest) -[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE) -[![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable) -[![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) +K-mers are used frequently in bioinformatics because, when k is small and known at +compile time, these sequences can be efficiently represented as integers and stored +directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. -## Description +In Kmers.jl, the `Kmer` type is parameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. -Kmers provides a specialised concrete `BioSequence` subtype, optimised for -representing short immutable sequences called kmers: contiguous sub-strings of k -nucleotides of some reference sequence. +Conceptually, one may use the following analogy: +* `BioSequence` is like `AbstractVector` +* `LongSequence` is like `Vector` +* `Kmer` is like [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) from `StaticArrays` -They are used extensively in bioinformatic analyses as an informational unit. -This concept was popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. +Kmers.jl is tightly coupled to the +[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, +and relies on its internals. +Hence, you should expect strict compat bounds on BioSequences.jl. -Kmers provides the type representing kmers as well as the implementations of -the APIs specified by the -[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package. 
+## Usage +### ⚠️ WARNING ⚠️ +`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such as way that the compiler can use constant folding. -## Installation +Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead +## Installation You can install BioSequences from the julia REPL. Press `]` to enter pkg mode, and enter the following: ```julia -add Kmers +pkg> add Kmers ``` -If you are interested in the cutting edge of the development, please check out +If you are interested in the cutting edge of development, please check out the master branch to try new features before release. - -## Testing - -Kmers is tested against Julia `1.X` on Linux, OS X, and Windows. - -[![Unit tests](https://github.com/BioJulia/Kmers.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/Kmers.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) -[![Documentation](https://github.com/BioJulia/Kmers.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/BioKmers.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) -[![](https://codecov.io/gh/BioJulia/Kmers.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Kmers.jl) - - ## Contributing - We appreciate contributions from users including reporting bugs, fixing issues, improving performance and adding new features. Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. - ## Questions? 
- If you have a question about contributing or using BioJulia software, come -on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the +on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/src/iteration.md b/docs/src/iteration.md deleted file mode 100644 index 3b36181..0000000 --- a/docs/src/iteration.md +++ /dev/null @@ -1,31 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Iterating over kmers - -When introducing the `Kmer` type we described kmers as contiguous sub-strings of -k nucleotides of some reference sequence. - -This package therefore contains functionality for iterating over all the valid -`Kmers{A,K,N}` in a longer `BioSequence`. - -```@docs -EveryKmer -EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -SpacedKmers -SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} -SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -EveryCanonicalKmer -EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -SpacedCanonicalKmers -SpacedCanonicalKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -``` - - diff --git a/docs/src/kmer_types.md b/docs/src/kmer_types.md deleted file mode 100644 index aab9330..0000000 --- 
a/docs/src/kmer_types.md +++ /dev/null @@ -1,92 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Kmer types - -Bioinformatic analyses make extensive use of kmers. -Kmers are contiguous sub-strings of k nucleotides of some reference sequence. - -They are used extensively in bioinformatic analyses as an informational unit. -This concept popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. - -BioSequences provides the following types to represent Kmers. - -```@docs -Kmer -``` - -The following aliases are also defined: - -```@docs -DNAKmer -DNA27mer -DNA31mer -DNA63mer -RNAKmer -RNA27mer -RNA31mer -RNA63mer -AAKmer -DNACodon -RNACodon -``` - - -### Skipmers - -For some analyses, the contiguous nature of kmers imposes limitations. -A single base difference, due to real biological variation or a sequencing error, -affects all k-mers crossing that position thus impeding direct analyses by identity. -Also, given the strong interdependence of local sequence, contiguous sections -capture less information about genome structure, and so they are more affected by -sequence repetition. - -Skipmers are a generalisation of the concept of a kmer. -They are created using a cyclic pattern of used-and-skipped positions which -achieves increased entropy and tolerance to nucleotide substitution differences -by following some simple rules. - -Skipmers preserve many of the elegant properties of kmers such as reverse -complementability and existence of a canonical representation. -Also, using cycles of three greatly increases the power of direct intersection -between the genomes of different organisms by grouping together the more conserved -nucleotides of protein-coding regions. 
- -BioSequences currently does not provide a separate type for skipmers, they are -represented using `Mer` and `BigMer` as their representation as a short immutable -sequence encoded in an unsigned integer is the same. -The distinction lies in how they are generated. - -#### Skipmer generation - -A skipmer is a simple cyclic q-gram that includes _m_ out of every _n_ bases -until a total of _k_ bases is reached. - -This is illustrated in the figure below (from -[this paper](https://www.biorxiv.org/content/biorxiv/early/2017/08/23/179960.full.pdf).): - -![skipmer-fig](skipmers.png) - -To maintain cyclic properties and the existence of the reverse-complement as a -skipmer defined by the same function, _k_ should be a multiple of _m_. - -This also enables the existence of a canonical representation for each skipmer, -defined as the lexicographically smaller of the forward and reverse-complement -representations. - -Defining _m_, _n_ and _k_ fixes a value for _S_, the total span of the skipmer, -given by: - -```math -S = n * (\frac{k}{m} - 1) + m -``` - -To see how to iterate over skipmers cf. kmers, see the Iteration section -of the manual. - diff --git a/docs/src/kmers.md b/docs/src/kmers.md new file mode 100644 index 0000000..c069742 --- /dev/null +++ b/docs/src/kmers.md @@ -0,0 +1,186 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +## The `Kmer` type +The central type of Kmers.jl is the `Kmer`. +A `Kmer` is an immutable, bitstype `BioSequence`, with a length known at compile +time. Compared to `LongSequence` in BioSequences.jl, +this gives one advantage, and comes with two disadvantages: +* Kmers are much faster than `LongSequence`, as they can be stored in registers. +* As kmers get longer, the code gets increasingly inefficient, as the unrolling + and inlining of the immutable operations breaks down. 
+* Since their length is part of their type, any operation that results in a kmer + whose length cannot be determined at compile time will be type unstable. + This includes slicing a kmer, pushing and popping it, and other operations. + +The `Kmer` type is (roughly) defined as +```julia +struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} + x::NTuple{N, UInt} +end +``` +Where: +* `A` is the `Alphabet` as defined in BioSequences.jl +* `K` is the length +* `N` is an extra type parameter derived from the first two, + which exists only because Julia does not allow computed type parameters. + +### Construction +Kmers can be constructed from a `BioSequence` or `AbstractString` by explicitly +specifying the length of the sequence: + +```jldoctest +julia> Kmer{DNAAlphabet{2}, 5, 1}("TAGCT") +DNA 5-mer: +TAGCT +``` + +The final type parameter can be elided, in which case it will be inferred: + +```jldoctest +julia> Kmer{DNAAlphabet{2}, 5}("TAGCT") +DNA 5-mer: +TAGCT +``` + +Kmers with alphabets `DNAAlphabet{2}`, `RNAAlphabet{2}` and `AminoAcidAlphabet` +can be created with the type aliases `DNAKmer`, `RNAKmer` and `AAKmer`: + +```jldoctest +julia> DNAKmer{3}("tag") +DNA 3-mer: +TAG + +julia> AAKmer{5}("PWYSK") +AminoAcid 5-mer: +PWYSK +``` + +For kmers with an `Alphabet` that implements `BioSequences.AsciiAlphabet`, they can also be constructed from `AbstractVector{UInt8}`, in which case the vector is interpreted as being bytes of ASCII text: + +```jldoctest +julia> AAKmer{3}([0x65, 0x67, 0x7a]) +AminoAcid 3-mer: +EGZ +``` + +When constructing from an `AbstractString` (or byte vector), uracil (`U`) and thymine (`T`) are treated differently - a `U` cannot be read as thymine: + +```jldoctest +julia> DNAKmer{3}("UAG") +ERROR: cannot encode 0x55 (Char 'U') in DNAAlphabet{2} +[...] 
+``` + +However, when constructing from a `BioSequence`, these nucleotides are considered +interchangeable: + +```jldoctest +julia> RNAKmer{4}(dna"TATC") +RNA 4-mer: +UAUC +``` + +Finally, kmers can be constructed with a string literal `@mer_str`, where the string must be appended with `d` for DNA, `r` for RNA, or `a` for amino acid: + +```jldoctest +julia> mer"UGCUGA"r +RNA 6-mer: +UGCUGA + +julia> mer"EDEHL"a +AminoAcid 5-mer: +EDEHL +``` + +Since the literals produce the kmer at parse time and inserts it directly into the parsed code, this will always be type stable, +and the overhead related to parsing the string will not be paid: + +```jldoctest; filter = r"(^\s+0\.\d+ seconds.+)|(^\d+$)" +julia> function count_aaas(dna) + x = 0 + for kmer in FwDNAMers{3}(dna) + # The parsing happens once here, when the + # code is parsed, and is fine to have in the loop + x += kmer == mer"AAA"d + end + x + end; + +julia> seq = randseq(DNAAlphabet{2}(), 100_000_000); +``` + +julia> @time count_aaas(seq) + 0.193463 seconds (32.05 k allocations: 2.051 MiB, 21.88% compilation time) +1563330 + +### Indexing +Kmers support most normal indexing, such as scalar indexing: + +```jldoctest +julia> mer"CAGCU"r[3] +RNA_G +``` + +Slicing + +```jldoctest +julia> mer"AGGCTA"d[2:5] +DNA 4-mer: +GGCT +``` + +And indexing with boolean vectors, and vectors of indices: + +```jldoctest +julia> m = mer"MDGKRY"a; + +julia> m[[true, false, true, true, false, true]] +AminoAcid 4-mer: +MGKY + +julia> m[[4,2]] +AminoAcid 2-mer: +KD +``` + +### A note on type stability +!!! warning + Except scalar indexing which always returns a single symbol, all the operations + above are _type unstable_, since the length (and thus type) of the resulting + kmer depends on the input value, not its type. 
+ +However, type unstable functions may be type-stable, if the indexing value is +known at compile time, and the Julia compiler uses constant folding: + +```jldoctest +julia> f(x) = x[2:5]; # 2:5 is a compile time constant + +julia> Test.@inferred f(mer"UCGUAGC"r) +RNA 4-mer: +CGUA +``` + +### Reference +```@docs +Kmer +Mer +@mer_str +DNAKmer +RNAKmer +AAKmer +DNACodon +RNACodon +pop +pop_first +push +push_first +shift +shift_first +``` \ No newline at end of file diff --git a/docs/src/predicates.md b/docs/src/predicates.md deleted file mode 100644 index 2aebc49..0000000 --- a/docs/src/predicates.md +++ /dev/null @@ -1,18 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Predicates - -The following predicate functions from BioSequences.jl are compatible with `Kmer`s. -Some have an optimised method defined in Kmers.jl. - -```@docs -isrepetitive -ispalindromic -hasambiguity -iscanonical -``` \ No newline at end of file diff --git a/docs/src/random.md b/docs/src/random.md deleted file mode 100644 index 8a1990b..0000000 --- a/docs/src/random.md +++ /dev/null @@ -1,14 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Generating random sequences - -You can generate random kmers using `Base.rand` function. 
- -```@docs -Base.rand(::Type{<:Kmer}) -``` \ No newline at end of file diff --git a/docs/src/transforms.md b/docs/src/transforms.md deleted file mode 100644 index 853b10c..0000000 --- a/docs/src/transforms.md +++ /dev/null @@ -1,52 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Indexing & modifying kmers - -## Indexing - -As `BioSequence` concrete subtypes, kmers can be indexed using integers - -```jldoctest -julia> kmer = Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC - -julia> kmer[3] -DNA_A -``` - -You can also slice Kmers using UnitRanges: - -```jldoctest -julia> kmer = Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC - -julia> kmer[1:3] -DNA 3-mer: -TTA -``` - -!!! warning - Using slicing will introduce performance penalties in your code if - you pass values of `i` that are not constants that can be propagated. - -## Modifying sequences - -Many modifying operations that are possible for some `BioSequences` such as -`LongSequence` are not possible for `Kmer`s, this is primarily due to the fact -`Kmer`s are an immutable struct. - -However some non-mutating transformations are available: - -```@docs -BioSequences.complement(::Kmer) -Base.reverse(::Kmer) -BioSequences.reverse_complement(::Kmer) -canonical -``` \ No newline at end of file diff --git a/docs/src/translate.md b/docs/src/translate.md deleted file mode 100644 index d518556..0000000 --- a/docs/src/translate.md +++ /dev/null @@ -1,62 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Translating and reverse translating - -## Translating -Just like other `BioSequence`s, `Kmer`s of RNA or DNA alphabets can be efficiently translated to amino acids: - -``` -julia> kmer = RNAKmer("AUGGGCCACUGA"); - -julia> translate(kmer) -AminoAcid 4-mer: -MGH* -``` - -For more information on translation and different genetic codes, see the documentation of BioSequences.jl. 
- -## Reverse translation -Reverse translation (or "revtrans", for short) refers to the mapping from amino acids back to the set of RNA codons that code for the given amino acid, under a given genetic code. -There is no known natural process of revtrans, but it can be useful to do _in silico_. - -In Kmers.jl, revtrans is done through the `reverse_translate` function. -This takes an amino acid sequence and produces a `Vector{CodonSet}`, where `CodonSet <: AbstractSet{RNACodon}`. -Alternatively, it takes an amino acid and produces a `CodonSet`. - -A reverse genetic code can optionally be specified as the second argument. -If not provided, it default to the reverse standard genetic code. - -### Example of reverse translation -```julia -julia> reverse_translate(AA_W) # default to standard genetic code -Kmers.CodonSet with 1 element: - UGG - -julia> code = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code); - -julia> reverse_translate(AA_W, code) -Kmers.CodonSet with 2 elements: - UGA - UGG -``` - -### Important notes on reverse translation -* `AA_Gap` cannot be reverse translated. Attempting so throws an error -* In cells, `AA_O` and `AA_U` are encoded by dynamic overloading of the codons `UAG` and `UGA`, respectively. - Because these codons normally code for `AA_Term`, the forward genetic code returns `AA_Term` for these codons. - However, we can unambiguously reverse translate them, so these amino acids translate to codonsets with these - precise codons. -* Ambiguous amino acids translate to the union of the possible amino acids. For example, if `AA_L` translate to set `S1`, - and `AA_I` translate to `S2`, then `AA_J` translate to `union(S1, S2)`. - -```@docs -Kmers.CodonSet -Kmers.ReverseGeneticCode -reverse_translate -reverse_translate! 
-``` diff --git a/docs/src/translation.md b/docs/src/translation.md new file mode 100644 index 0000000..4fa00cf --- /dev/null +++ b/docs/src/translation.md @@ -0,0 +1,20 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +### Reference +```@docs +reverse_translate +reverse_translate! +FwKmers +CanonicalKmers +UnambiguousKmers +CodonSet +ReverseGeneticCode +AbstractKmerIterator +``` \ No newline at end of file diff --git a/src/Kmers.jl b/src/Kmers.jl index 56a871c..cbb1704 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -25,6 +25,7 @@ export Kmer, shift, shift_first, pop, + pop_first, # Iterators FwKmers, @@ -124,12 +125,17 @@ export Kmer, RNAAlphabet, translate, complement, - reverse_complement + reverse_complement, + canonical, + iscanonical # Kmers.jl is tightly coupled to BioSequences and relies on much of its internals. # Hence, we do not care about carefully importing specific symbols using BioSequences +# This is a documented method, not internals +using Base: tail + """ Kmers.Unsafe diff --git a/src/construction.jl b/src/construction.jl index 3843b55..ce4a8fb 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -120,7 +120,7 @@ end byte = seq[i] encoding = BioSequences.ascii_encode(Alphabet(T), byte) if encoding > 0x7f - throw(BioSequences.EncodeError(Alphabet(T), repr(byte))) + throw(BioSequences.EncodeError(Alphabet(T), byte)) end (_, data) = leftshift_carry(data, bps, encoding % UInt) end diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl index a37cd5a..f2c4344 100644 --- a/src/iterators/CanonicalKmers.jl +++ b/src/iterators/CanonicalKmers.jl @@ -15,7 +15,7 @@ Can be constructed more conventiently with the constructors `CanonicalDNAMers{K} # Examples: ```jldoctest julia> collect(CanonicalRNAMers{3}("AGCGA")) -3-element Vector{Kmer{RNAAlphabet{2}, 4, 1}}: +3-element Vector{Kmer{RNAAlphabet{2}, 3, 1}}: AGC CGC CGA @@ -26,7 +26,6 @@ struct CanonicalKmers{A 
<: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, end source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S -load_source(x::CanonicalKmers) = x.it.seq @inline Base.length(it::CanonicalKmers) = length(it.it) # Constructors diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 1f5470b..3be5313 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -11,11 +11,11 @@ and similar also for `FwRNAMers` and `FwAAMers`. ```jldoctest julia> v = collect(FwDNAMers{3}("AGCGTATA")); -julia eltype(v), length(v) +julia> eltype(v), length(v) (Kmer{DNAAlphabet{2}, 3, 1}, 6) -julia> length(collect(FwRNAMers{3}(rna"UGDCUGAVC"))) -2 +julia> collect(FwRNAMers{3}(rna"UGCDUGAVC")) +ERROR: cannot encode D in RNAAlphabet{2} ``` """ struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} @@ -28,8 +28,7 @@ struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} end end -source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S # TODO: Can be deleted? -load_source(x::FwKmers) = x.seq # TODO: Can be deleted? Is it unused, here and other defs +source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S @inline function Base.length(it::FwKmers{A, K, S}) where {A, K, S} src = used_source(RecodingScheme(A(), S), it.seq) diff --git a/src/iterators/common.jl b/src/iterators/common.jl index 6e2b2fa..f975318 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -1,13 +1,17 @@ +# TODO: Make sure to go through this docstring """ AbstractKmerIterator{A <: Alphabet, K} +Abstract type for kmer iterators. The element type is `Kmer{A, K, N}`, +with the appropriately derived N. + Iterates `Kmer{A, K}`. 
Functions to implement: * `Base.iterate` Optional functions: * `source_type` -* `load_source` +* `Base.IteratorSize`, if not `HasLength` """ abstract type AbstractKmerIterator{A <: Alphabet, K} end @@ -22,13 +26,6 @@ Get the type of the data source that kmers are extracted from """ function source_type end -""" - load_source(x::AbstractKmerIterator)::source_type(typeof(x)) - -Get the data source from the kmer iterator. -""" -function load_source end - function used_source(R::RecodingScheme, s) if R isa AsciiEncode && s isa Union{String, SubString{String}} codeunits(s) diff --git a/src/kmer.jl b/src/kmer.jl index eb33a2b..89f2501 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -13,20 +13,20 @@ See also: [`DNAKmer`](@ref), [`RNAKmer`](@ref), [`AAKmer`](@ref), [`AbstractKmer # Examples ```jldoctest julia> RNAKmer{5}("ACGUC") -RNA 5-mer +RNA 5-mer: ACGUC julia> Kmer{DNAAlphabet{4}, 6}(dna"TGCTTA") -DNA 6-mer +DNA 6-mer: TGCTTA julia> AAKmer{5}((lowercase(i) for i in "KLWYR")) -AminoAcid 5-mer -TGCTTA +AminoAcid 5-mer: +KLWYR julia> RNAKmer{3}("UAUC") # wrong length ERROR: -[ ... ] +[...] ``` """ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} @@ -62,7 +62,7 @@ for the alphabat julia> mer"DEKR"a isa Mer{4} true -julia> DNAKmer{2}("TGATCA") isa Mer{6} +julia> DNAKmer{6}("TGATCA") isa Mer{6} true julia> RNACodon <: Mer{3} @@ -141,10 +141,10 @@ end @inline derive_type(::Type{Kmer{A, K}}) where {A, K} = Kmer{A, K, n_coding_elements(Kmer{A, K})} -zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) +@inline zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) # TODO: Should this somehow throw a MethodError if N is already parameterized? 
-function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} +@inline function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} T2 = derive_type(Kmer{A, K}) T2(unsafe, zero_tuple(T2)) end @@ -208,12 +208,12 @@ See also: [`push_first`](@ref), [`pop`](@ref), [`shift`](@ref) # Examples ```jldoctest -julia> shift(mer"UGCUGA"r, RNA_G) -RNA 7-mer +julia> push(mer"UGCUGA"r, RNA_G) +RNA 7-mer: UGCUGAG -julia> shift(mer"W"a, 'E') -AminoAcid 2-mer +julia> push(mer"W"a, 'E') +AminoAcid 2-mer: WE ``` """ @@ -245,11 +245,11 @@ See also: [`shift_first`](@ref), [`push`](@ref) # Examples ```jldoctest julia> shift(mer"TACC"d, DNA_A) -DNA 4-mer +DNA 4-mer: ACCA julia> shift(mer"WKYMLPIIRS"aa, 'F') -AminoAcid 10-mer +AminoAcid 10-mer: KYMLPIIRSF ``` """ @@ -259,10 +259,10 @@ function shift(kmer::Kmer{A}, s) where {A} end @inline function shift_encoding(kmer::Kmer, encoding::UInt) + isempty(kmer) && return kmer bps = BioSequences.bits_per_symbol(kmer) (_, new_data) = leftshift_carry(kmer.data, bps, encoding) - (head, tail...) = new_data - typeof(kmer)(unsafe, (head & get_mask(typeof(kmer)), tail...)) + typeof(kmer)(unsafe, (first(new_data) & get_mask(typeof(kmer)), Base.tail(new_data)...)) end """ @@ -280,12 +280,12 @@ See also: [`push`](@ref), [`pop`](@ref), [`shift`](@ref) # Examples ```jldoctest -julia> shift(mer"GCU"r, RNA_G) -RNA 4-mer +julia> push_first(mer"GCU"r, RNA_G) +RNA 4-mer: GGCU -julia> shift(mer"W"a, 'E') -AminoAcid 2-mer +julia> push_first(mer"W"a, 'E') +AminoAcid 2-mer: EW ``` """ @@ -298,10 +298,9 @@ function push_first(kmer::Kmer{A}, s) where {A} else kmer.data end - (head, tail...) 
= new_data encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) - head |= left_shift(encoding, (elements_in_head(newT) - 1) * bps) - newT(unsafe, (head, tail...)) + head = first(new_data) | left_shift(encoding, (elements_in_head(newT) - 1) * bps) + newT(unsafe, (head, tail(new_data)...)) end """ @@ -314,11 +313,11 @@ See also: [`shift`](@ref), [`push`](@ref) # Examples ```jldoctest julia> shift_first(mer"TACC"d, DNA_A) -DNA 4-mer +DNA 4-mer: ATAC julia> shift_first(mer"WKYMLPIIRS"aa, 'F') -AminoAcid 10-mer +AminoAcid 10-mer: FWKYMLPIIR ``` """ @@ -328,11 +327,12 @@ function shift_first(kmer::Kmer{A}, s) where {A} end function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where {A} + isempty(kmer) && return kmer bps = BioSequences.bits_per_symbol(A()) (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) - (head, tail...) = new_data - head |= left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) - typeof(kmer)(unsafe, (head, tail...)) + head = + first(new_data) | left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) + typeof(kmer)(unsafe, (head, tail(new_data)...)) end """ @@ -342,19 +342,19 @@ Returns a new kmer with the last symbol of the input `kmer` removed. Throws an `ArgumentError` if `kmer` is empty. !!! warn - Since the output of this function is a `K+1`-mer, use of this function + Since the output of this function is a `K-1`-mer, use of this function in a loop may result in type-instability. -See also: [`push`](@ref), [`shift`](@ref) +See also: [`pop_first`](@ref), [`push`](@ref), [`shift`](@ref) # Examples ```jldoctest julia> pop(mer"TCTGTA"d) -DNA 5-mer +DNA 5-mer: TCTGT julia> pop(mer"QPSY"a) -AminoAcid 3-mer +AminoAcid 3-mer: QPS julia> pop(mer""a) @@ -368,14 +368,54 @@ function pop(kmer::Kmer{A}) where {A} newT = derive_type(Kmer{A, length(kmer) - 1}) (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) new_data = if elements_in_head(typeof(kmer)) == 1 - (head, tail...) 
= new_data - tail + tail(new_data) else new_data end newT(unsafe, new_data) end +""" + pop_first(kmer::Kmer{A, K})::Kmer{A, K-1} + +Returns a new kmer with the first symbol of the input `kmer` removed. +Throws an `ArgumentError` if `kmer` is empty. + +!!! warn + Since the output of this function is a `K-1`-mer, use of this function + in a loop may result in type-instability. + +See also: [`pop`](@ref), [`push`](@ref), [`shift`](@ref) + +# Examples +```jldoctest +julia> pop_first(mer"TCTGTA"d) +DNA 5-mer: +CTGTA + +julia> pop_first(mer"QPSY"a) +AminoAcid 3-mer: +PSY + +julia> pop_first(mer""a) +ERROR: ArgumentError: +[...] +``` +""" +function pop_first(kmer::Kmer{A}) where {A} + isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) + data = if elements_in_head(typeof(kmer)) == 1 + tail(kmer.data) + else + bps = BioSequences.bits_per_symbol(A()) + bits_used = 8 * sizeof(UInt) - (bits_unused(typeof(kmer)) + bps) + mask = left_shift(UInt(1), bits_used) - UInt(1) + (first(kmer.data) & mask, tail(kmer.data)...) + end + newT = derive_type(Kmer{A, length(kmer) - 1}) + newT(unsafe, data) +end + # Get a mask 0x0001111 ... masking away the unused bits of the head element # in the UInt tuple @inline function get_mask(T::Type{<:Kmer}) diff --git a/src/revtrans.jl b/src/revtrans.jl index 55b9c71..ed4c381 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -18,7 +18,7 @@ julia> Set(CodonSet(v)) == Set(v) true julia> union(CodonSet(v), CodonSet([mer"GAG"r])) -Kmers.CodonSet with 4 elements: +CodonSet with 4 elements: GAG GGA UAG @@ -74,7 +74,7 @@ inverse of the mapping through `GeneticCode` julia> code = ReverseGeneticCode(BioSequences.candidate_division_sr1_genetic_code); julia> code[AA_E] -Kmers.CodonSet with 2 elements: +CodonSet with 2 elements: GAA GAG @@ -147,8 +147,11 @@ See also: [`reverse_translate`](@ref) julia> v = CodonSet[]; julia> reverse_translate!(v, aa"KWCL") -4-element Vector{CodonSet} -[...] 
+4-element Vector{CodonSet}: + CodonSet(0x0000000000000005) + CodonSet(0x0400000000000000) + CodonSet(0x0a00000000000000) + CodonSet(0x50000000f0000000) ``` """ function reverse_translate!(v::Vector{CodonSet}, seq::AASeq, code=rev_standard_genetic_code) @@ -169,21 +172,20 @@ If `s` is an `AASeq`, return `Vector{CodonSet}`. # Examples ```jldoctest julia> reverse_translate(AA_W) -Kmers.CodonSet with 1 element: +CodonSet with 1 element: UGG julia> v = reverse_translate(aa"MMLVQ"); julia> typeof(v) -Vector{Kmers.CodonSet} +Vector{CodonSet} (alias for Array{CodonSet, 1}) julia> v[4] -Kmers.CodonSet with 4 elements: +CodonSet with 4 elements: GUA GUC GUG GUU -[...] ``` See also: [`reverse_translate!`](@ref), [`ReverseGeneticCode`](@ref) diff --git a/src/transformations.jl b/src/transformations.jl index 6ea7ad5..bb51127 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -21,8 +21,7 @@ end function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}) isempty(x) && return x data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) - (head, tail...) = data - typeof(x)(unsafe, ((head & get_mask(typeof(x))), tail...)) + typeof(x)(unsafe, ((first(data) & get_mask(typeof(x))), Base.tail(data)...)) end # Generic fallback diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 659e147..bfbc4de 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -111,10 +111,10 @@ end nbits::Integer, carry::T, ) where {T <: Unsigned} - head, tail... 
= x - (new_carry, new_tail) = leftshift_carry(tail, nbits, carry) - new_head = left_shift(head, nbits) | new_carry - (left_carry(head, nbits), (new_head, new_tail...)) + isempty(x) && return x + (new_carry, new_tail) = leftshift_carry(tail(x), nbits, carry) + new_head = left_shift(first(x), nbits) | new_carry + (left_carry(first(x), nbits), (new_head, new_tail...)) end @inline function rightshift_carry( @@ -122,11 +122,11 @@ end nbits::Integer, carry::T, ) where {T <: Unsigned} - head, tail... = x - new_head = right_shift(head, nbits) | right_carry(carry, nbits) + isempty(x) && return x + new_head = right_shift(first(x), nbits) | right_carry(carry, nbits) mask = left_shift(UInt(1), nbits) - 1 - tail_carry = head & mask - (new_carry, new_tail) = rightshift_carry(tail, nbits, tail_carry) + tail_carry = first(x) & mask + (new_carry, new_tail) = rightshift_carry(tail(x), nbits, tail_carry) (new_carry, (new_head, new_tail...)) end From c92db183b99bc48d04d01e72ef68db70e63a9a7c Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 30 Dec 2023 21:24:50 +0100 Subject: [PATCH 23/33] Add rest of the docs --- docs/make.jl | 6 +- docs/src/faq.md | 39 +++++++ docs/src/iteration.md | 98 +++++++++++++++++ docs/src/kmers.md | 5 +- docs/src/translation.md | 37 +++++-- src/Kmers.jl | 8 +- src/iterators/CanonicalKmers.jl | 3 + src/iterators/FwKmers.jl | 6 ++ src/iterators/SpacedKmers.jl | 171 ++++++++++++++++++++++++++++++ src/iterators/UnambiguousKmers.jl | 4 +- 10 files changed, 363 insertions(+), 14 deletions(-) create mode 100644 docs/src/faq.md create mode 100644 docs/src/iteration.md create mode 100644 src/iterators/SpacedKmers.jl diff --git a/docs/make.jl b/docs/make.jl index 8c5d80c..cd5d0d2 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,11 +14,9 @@ makedocs(; pages=[ "Home" => "index.md", "The Kmer type" => "kmers.md", + "Iteration" => "iteration.md", "Translation" => "translation.md", - # The kmer type (construction, indexing) - # Kmer iteration - # Translation 
(revtrans also) - # FAQ (why not compare to bioseq, why no unambig canonical) + "FAQ" => "faq.md", ], authors="Jakob Nybo Nissen, Sabrina J. Ward, The BioJulia Organisation and other contributors.", checkdocs=:exports, diff --git a/docs/src/faq.md b/docs/src/faq.md new file mode 100644 index 0000000..fd2590e --- /dev/null +++ b/docs/src/faq.md @@ -0,0 +1,39 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` +## FAQ +### Why can kmers not be compared to biosequences? +It may be surprising that kmers cannot be compared to other biosequences: + +```jldoctest +julia> dna"TAG" == mer"TAG"d +ERROR: MethodError +[...] +``` + +In fact, this is implemented by a manually thrown `MethodError`; the generic case `Base.:==(::BioSequence, ::BioSequence)` is defined. + +The reason for this is the consequence of the following limitations: +* `isequal(x, y)` implies `hash(x) == hash(y)` +* `isequal(x, y)` and `x == y` ought to be identical for well-defined elements (i.e. in the absence of `missing`s and `NaN`s etc.) +* `hash(::Kmer)` must be absolutely maximally efficient + +If kmers were to be comparable to `BioSequence`, then the hashing of `BioSequence` should follow `Kmer`, which practically speaking would mean that all biosequences would need to be recoded to `Kmer`s before hashing. + +### Why isn't there an iterator of unambiguous, canonical kmers or spaced, canonical kmers? +Any iterator of nucleotide kmers can be made into a canonical kmer iterator by simply calling `canonical` on its output kmers. + +The `CanonicalKmers` iterator is special cased, because with a step size of 1, it is generally faster to build the next kmer by storing both the reverse and forward kmer, then creating the next kmer by prepending/appending the next symbol. + +However, with a larger step size, it becomes more efficient to build the forward kmer, then reverse-complement the whole kmer. 
+ +### Why isn't there an iterator of skipmers/minimizers/k-min-mers, etc? +The concept of kmers has turned out to be remarkably flexible and useful in bioinformatics, and has spawned a neverending stream of variations. + +We simply can't implement them all. +However, we hope to make it relatively easy to implement custom kmer iterators for downstream users. \ No newline at end of file diff --git a/docs/src/iteration.md b/docs/src/iteration.md new file mode 100644 index 0000000..8dfa0de --- /dev/null +++ b/docs/src/iteration.md @@ -0,0 +1,98 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` +## Iteration +Most applications of kmers extract multiple kmers from an underlying sequence. +To facilitate this, Kmers.jl implements a few basic kmer iterators which are all subtypes of `AbstractKmerIterator`. + +The underlying sequence can be a `BioSequence`, `AbstractString`, or `AbstractVector{UInt8}`. +In the last case, if the alphabet of the element type implements `BioSequences.AsciiAlphabet`, the vector will be treated as a vector of ASCII characters. + +Similarly to the rules when constructing kmers directly, DNA and RNA are treated interchangeably when the underlying sequence is a `BioSequence`, but when the underlying sequence is a string or bytevector, `U` and `T` are considered different, and e.g. uracil cannot be constructed from a sequence containing `T`: + +```jldoctest +julia> only(FwDNAMers{3}(rna"UGU")) +DNA 3-mer: +TGT + +julia> only(FwDNAMers{3}("UGU")) +ERROR: +[...] +``` + +The following kmer iterators are implemented: + +### `FwKmers` +The most basic kmer iterator is `FwKmers`, which simply iterates every kmer, in order: + +```@docs +FwKmers +FwDNAMers +FwRNAMers +FwAAMers +``` + +### `CanonicalKmers` +This iterator is similar to [`FwKmers`](@ref), however, for each `Kmer` encountered, it returns the _canonical_ kmer. 
+ +The canonical kmer is defined as the lexicographically smaller of a kmer and its reverse complement. +That is, if [`FwKmers`](@ref) would iterate `TCAC`, then [`CanonicalKmers`](@ref) would return `GTGA`, as this is the reverse complement of `TCAC`, and is before `TCAC` in the alphabet. + +[`CanonicalKmers`](@ref) is useful for summarizing the kmer composition of sequences whose strandedness is unknown. + +```@docs +CanonicalKmers +CanonicalDNAMers +CanonicalRNAMers +``` + +### `UnambiguousKmers` +[`UnambiguousKmers`](@ref) iterates unambiguous nucleotides (that is, kmers of the alphabets `DNAAlphabet{2}` or `RNAAlphabet{2}`). +Any kmers containing [ambiguous nucleotides](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC341218/) such as `W` or `N` are skipped. + +```@docs +UnambiguousKmers +UnambiguousDNAMers +UnambiguousRNAMers +``` + +### `SpacedKmers` +The [`SpacedKmers`](@ref) iterator iterates kmers with a fixed step size between k-mers. +For example, for a K of 4, and a step size of 3, the output kmers would overlap with a single nucleotide, like so: + +``` +seq: TGATGCGTAGTG + TGAT + TGCG + GTAG +``` + +Hence, if `FwKmers` is analogous to `UnitRange`, `SpacedKmers` is analogous to `StepRange`. + +```@docs +SpacedKmers +SpacedDNAMers +SpacedRNAMers +SpacedAAMers +``` + +The convenience functions [`each_dna_codon`](@ref) and [`each_rna_codon`](@ref) return `SpacedKmers` with a K value of 3 and step size of 3: + +```@docs +each_dna_codon +each_rna_codon +``` + +## The `AbstractKmerIterator` interface +It's very likely that users of Kmers.jl need to implement their own custom kmer iterators, in which case they should subtype [`AbstractKmerIterator`](@ref). + +```@docs +AbstractKmerIterator +``` + +There is no real interface implemented for this abstract type, other than that `AbstractKmerIterator{A, K}` needs to iterate `Kmer{A, K}`. 
diff --git a/docs/src/kmers.md b/docs/src/kmers.md index c069742..f78d2f3 100644 --- a/docs/src/kmers.md +++ b/docs/src/kmers.md @@ -102,7 +102,7 @@ EDEHL Since the literals produce the kmer at parse time and inserts it directly into the parsed code, this will always be type stable, and the overhead related to parsing the string will not be paid: -```jldoctest; filter = r"(^\s+0\.\d+ seconds.+)|(^\d+$)" +```jldoctest; filter = [r"^\s*0\.\d+ seconds.+"s, r"^\d+"s] julia> function count_aaas(dna) x = 0 for kmer in FwDNAMers{3}(dna) @@ -114,11 +114,12 @@ julia> function count_aaas(dna) end; julia> seq = randseq(DNAAlphabet{2}(), 100_000_000); -``` julia> @time count_aaas(seq) 0.193463 seconds (32.05 k allocations: 2.051 MiB, 21.88% compilation time) 1563330 +``` + ### Indexing Kmers support most normal indexing, such as scalar indexing: diff --git a/docs/src/translation.md b/docs/src/translation.md index 4fa00cf..d9cefd9 100644 --- a/docs/src/translation.md +++ b/docs/src/translation.md @@ -7,14 +7,39 @@ DocTestSetup = quote end ``` -### Reference +## Translation +`Kmer`s can be translated using the `translate` function exported by `BioSequences`: + +```jldoctest +julia> translate(mer"UGCUUGAUC"r) +AminoAcid 3-mer: +CLI +``` + +Since `Kmer`s are immutable, the in-place `translate!` function is not implemented for `Kmers`. +Also, remember that `Kmer`s are only efficient when short (at most a few hundred symbols). Hence, entire exons or genes should probably be represented by `LongSequence` or `LongSubSeq`. + +### Reverse translation +Kmers.jl implements reverse translation, in which an amino acid sequence is translated to an RNA sequence. +While this process doesn't occur naturally (as far as we know), it is still useful for some analyses. + +Since genetic codes are degenerate, i.e. 
multiple codons code for the same amino acid, reverse translating a sequence does not return a nucleic acid sequence, but a vector of `CodonSet`: + ```@docs reverse_translate -reverse_translate! -FwKmers -CanonicalKmers -UnambiguousKmers CodonSet +``` + +`CodonSet` is an efficiently implemented `AbstractSet{RNACodon}` (and remember, `RNACodon` is an alias for `RNAKmer{3, 1}`). + +To avoid allocating a new `Vector`, you can use `reverse_translate!`: + +```@docs +reverse_translate! +``` + +Both functions take a genetic code as an optional positional argument of the type `ReverseGeneticCode`. This object determines the mapping from amino acid to `CodonSet` - by default the [standard genetic code](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables#Standard_RNA_codon_table) is used - this mapping is used by nearly all organisms: + +```@docs ReverseGeneticCode -AbstractKmerIterator ``` \ No newline at end of file diff --git a/src/Kmers.jl b/src/Kmers.jl index cbb1704..9dc31f0 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -38,6 +38,12 @@ export Kmer, UnambiguousKmers, UnambiguousDNAMers, UnambiguousRNAMers, + SpacedKmers, + SpacedDNAMers, + SpacedRNAMers, + SpacedAAMers, + each_dna_codon, + each_rna_codon, # Reverse translation CodonSet, @@ -162,6 +168,6 @@ include("iterators/common.jl") include("iterators/FwKmers.jl") include("iterators/CanonicalKmers.jl") include("iterators/UnambiguousKmers.jl") -#include("iterators/SpacedKmers.jl") +include("iterators/SpacedKmers.jl") end # module diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl index f2c4344..075d7e7 100644 --- a/src/iterators/CanonicalKmers.jl +++ b/src/iterators/CanonicalKmers.jl @@ -36,7 +36,10 @@ function CanonicalKmers{A, K, S}(s::S) where {S, A <: NucleicAcidAlphabet, K} CanonicalKmers{A, K, S}(FwKmers{A, K}(s)) end +"`CanonicalDNAMers{K, S}`: Alias for `CanonicalKmers{DNAAlphabet{2}, K, S}`" const CanonicalDNAMers{K, S} = CanonicalKmers{DNAAlphabet{2}, K, S} + +"`CanonicalRNAMers{K, S}`: 
Alias for `CanonicalKmers{RNAAlphabet{2}, K, S}`" const CanonicalRNAMers{K, S} = CanonicalKmers{RNAAlphabet{2}, K, S} @inline function Base.iterate(it::CanonicalKmers{A, K, S}, state...) where {A, K, S} diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 3be5313..3fb1d5a 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -38,8 +38,13 @@ end # Constructors FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)}(s) +"`FwDNAMers{K, S}`: Alias for `FwKmers{DNAAlphabet{2}, K, S}`" const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} + +"`FwRNAMers{K, S}`: Alias for `FwKmers{RNAAlphabet{2}, K, S}`" const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} + +"`FwAAMers{K, S}`: Alias for `FwKmers{AminoAcidAlphabet, K, S}`" const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} # TODO: Should this go in common? @@ -98,6 +103,7 @@ end (kmer, i) = state i > length(it.seq) && return nothing encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + # TODO: Abstract this into a function if count_ones(encoding) != 1 throw( BioSequences.EncodeError( diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl new file mode 100644 index 0000000..6ace5af --- /dev/null +++ b/src/iterators/SpacedKmers.jl @@ -0,0 +1,171 @@ +""" + SpacedKmers{A <: Alphabet, K, J, S} + +Iterator of kmers with step size. 
`J` signifies the step size, `S` +the type of the underlying sequence, and the eltype of the iterator +is `Kmer{A, K, N}` with the appropriate `N` + +See also: [`each_dna_codon`](@ref), [`FwKmers`](@ref) + +# Examples: +```jldoctest +julia> collect(SpacedDNAMers{3, 2}("AGCGTATA")) +3-element Vector{Kmer{DNAAlphabet{2}, 3, 1}}: + AGC + CGT + TAT +``` +""" +struct SpacedKmers{A <: Alphabet, K, J, S} <: AbstractKmerIterator{A, K} + seq::S + + function SpacedKmers{A, K, J, S}(seq::S) where {A, K, J, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + J isa Int || error("J must be an Int") + J > 0 || error("J must be at least 1") + new{A, K, J, S}(seq) + end +end + +source_type(::Type{SpacedKmers{A, K, J, S}}) where {A, K, J, S} = S +stepsize(::SpacedKmers{A, K, J}) where {A, K, J} = J + +@inline function Base.length(it::SpacedKmers{A, K, J}) where {A, K, J} + src = used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq) + L = length(src) + L < K ? 0 : div((L - K), J) + 1 +end + +SpacedKmers{A, K, J}(s) where {A <: Alphabet, K, J} = SpacedKmers{A, K, J, typeof(s)}(s) + +"`SpacedDNAMers{K, J, S}`: Alias for `SpacedKmers{DNAAlphabet{2}, K, J, S}`" +const SpacedDNAMers{K, J, S} = SpacedKmers{DNAAlphabet{2}, K, J, S} + +"`SpacedRNAMers{K, J, S}`: Alias for `SpacedKmers{RNAAlphabet{2}, K, J, S}`" +const SpacedRNAMers{K, J, S} = SpacedKmers{RNAAlphabet{2}, K, J, S} + +"`SpacedAAMers{K, J, S}`: Alias for `SpacedKmers{AminoAcidAlphabet, K, J, S}`" +const SpacedAAMers{K, J, S} = SpacedKmers{AminoAcidAlphabet, K, J, S} + +# TODO: Do we need two function names for this?... +# Could it be each_codon(DNA, s) <- requires RNA/DNA to be exported +# each(DNACodon) <- awkward since DNACodon is merely an alias +""" + each_dna_codon(s) + +Construct an iterator of `DNACodon` from `s`, iterating over every `DNACodon` +in `s`, in-frame, i.e. with a step size of 3. 
+ +See also: [`SpacedKmers`](@ref) + +Examples: +```jldoctest +julia> collect(each_dna_codon("TGACGATCGAC")) +3-element Vector{Kmer{DNAAlphabet{2}, 3, 1}}: + TGA + CGA + TCG +``` +""" +@inline each_dna_codon(s) = SpacedDNAMers{3, 3}(s) + +"The `RNA` equivalent of [`each_dna_codon`](@ref)" +@inline each_rna_codon(s) = SpacedRNAMers{3, 3}(s) + +@inline function Base.iterate(it::SpacedKmers{A}, state...) where {A} + iterate_kmer(RecodingScheme(A(), source_type(typeof(it))), it, state...) +end + +# TODO: Maybe in all kmer iterators, instantiate it with the source type, +# so we don't have to get the source type in functions (and thus +# it is allwoed to be a costly operation). +# However, this means we instantiate e.g. a FwKmers{A, K, S} and change S +# in the source type in the constructor +@inline function iterate_kmer( + R::RecodingScheme, + it::SpacedKmers{A, K}, +) where {A <: Alphabet, K} + length(it.seq) < ksize(eltype(it)) && return nothing + kmer = unsafe_extract( + R, + eltype(it), + used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq), + 1, + ) + next_index = 1 + max(stepsize(it), ksize(eltype(it))) + (kmer, (kmer, next_index)) +end + +# Here, we need to convert to an abstractvector +# TODO: This function and the one above can be merged with the FwKmers one? 
+@inline function iterate_kmer( + R::AsciiEncode, + it::SpacedKmers{A, K, J, S}, +) where {A <: Alphabet, K, J, S <: Bytes} + src = used_source(RecodingScheme(A(), S), it.seq) + Base.require_one_based_indexing(src) + length(src) < ksize(eltype(it)) && return nothing + kmer = unsafe_extract(R, eltype(it), src, 1) + next_index = 1 + max(stepsize(it), ksize(eltype(it))) + (kmer, (kmer, next_index)) +end + +@inline function iterate_kmer(::RecodingScheme, it::SpacedKmers{A, K, J, S}, state) where {A, K, S, J} + src = used_source(RecodingScheme(A(), S), it.seq) + R = RecodingScheme(A(), S) + Base.require_one_based_indexing(src) + (kmer, i) = state + i > lastindex(src) - min(K, J) + 1 && return nothing + next_i = i + min(K, J) + # This branch should be resolved statically + if J ≥ K + kmer = unsafe_extract(R, eltype(it), src, i) + else + for _ in 1:J + kmer = update_kmer(R, it, kmer, i) + i += 1 + end + end + (kmer, (kmer, next_i)) +end + +# TODO: Can this function be used more generically, by the other iterators? +# I.e. 
this simply fetches a single element +@inline function update_kmer(::GenericRecoding, it::SpacedKmers, kmer::Kmer, i::Int) + symbol = @inbounds it.seq[i] + shift(kmer, convert(eltype(kmer), symbol)) +end + +@inline function update_kmer(::Copyable, it::SpacedKmers, kmer::Kmer, i::Int) + shift_encoding(kmer, UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt) +end + +@inline function update_kmer(::TwoToFour, it::SpacedKmers, kmer::Kmer, i::Int) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(it.seq, i))) + shift_encoding(kmer, encoding) +end + +@inline function update_kmer(::FourToTwo, it::SpacedKmers, kmer::Kmer, i::Int) + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(kmer), + reinterpret(eltype(it.seq), encoding % UInt8), + ), + ) + end + shift_encoding(kmer, trailing_zeros(encoding) % UInt) +end + +@inline function update_kmer(::AsciiEncode, it::SpacedKmers, kmer::Kmer, i::Int) + src = used_source(RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), it.seq) + Base.require_one_based_indexing(src) + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + end + shift_encoding(kmer, encoding % UInt) +end diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl index a4bad4a..57b2aaa 100644 --- a/src/iterators/UnambiguousKmers.jl +++ b/src/iterators/UnambiguousKmers.jl @@ -29,7 +29,6 @@ end Base.IteratorSize(::Type{<:UnambiguousKmers}) = Base.SizeUnknown() source_type(::Type{UnambiguousKmers{A, K, S}}) where {A, K, S} = S -load_source(x::UnambiguousKmers) = x.it.seq # Constructors function UnambiguousKmers{A, K}(s::S) where {S, A <: TwoBit, K} @@ -39,7 +38,10 @@ function UnambiguousKmers{A, K, S}(s::S) where {S, A <: TwoBit, K} UnambiguousKmers{A, K, S}(FwKmers{A, K}(s)) 
end +"`UnambiguousDNAMers{K, S}`: Alias for `UnambiguousKmers{DNAAlphabet{2}, K, S}`" const UnambiguousDNAMers{K, S} = UnambiguousKmers{DNAAlphabet{2}, K, S} + +"`UnambiguousRNAMers{K, S}`: Alias for `UnambiguousKmers{RNAAlphabet{2}, K, S}`" const UnambiguousRNAMers{K, S} = UnambiguousKmers{RNAAlphabet{2}, K, S} @inline function Base.iterate(it::UnambiguousKmers{A, K, S}) where {A, K, S} From d91661691d11d1ff48644d4585c60b2052d9a2d7 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sat, 30 Dec 2023 22:20:51 +0100 Subject: [PATCH 24/33] Remove old iterators --- Project.toml | 9 +- src/old_iter/AbstractKmerIterator.jl | 51 -------- src/old_iter/EveryCanonicalKmer.jl | 150 ------------------------ src/old_iter/EveryKmer.jl | 168 --------------------------- src/old_iter/SpacedCanonicalKmers.jl | 168 --------------------------- src/old_iter/SpacedKmers.jl | 150 ------------------------ 6 files changed, 6 insertions(+), 690 deletions(-) delete mode 100644 src/old_iter/AbstractKmerIterator.jl delete mode 100644 src/old_iter/EveryCanonicalKmer.jl delete mode 100644 src/old_iter/EveryKmer.jl delete mode 100644 src/old_iter/SpacedCanonicalKmers.jl delete mode 100644 src/old_iter/SpacedKmers.jl diff --git a/Project.toml b/Project.toml index 160e09d..4f8249f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,10 @@ name = "Kmers" uuid = "445028e4-d31f-4f27-89ad-17affd83fc22" -authors = ["Sabrina Jaye Ward "] -version = "0.1.0" +authors = [ + "Jakob Nybo Nissen ", + "Sabrina Jaye Ward " +] +version = "1.0.0" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" @@ -17,4 +20,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [targets] -test = ["Test", "Random"] +test = ["Test", "Random", "BioSequences"] diff --git a/src/old_iter/AbstractKmerIterator.jl b/src/old_iter/AbstractKmerIterator.jl deleted file mode 100644 index 4e20420..0000000 --- a/src/old_iter/AbstractKmerIterator.jl +++ /dev/null @@ -1,51 +0,0 @@ -### 
-### Kmer Iteration -### -### Abstract Kmer Iterator type. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -### Type for storing the result of Kmer iteration. - -abstract type AbstractKmerIterator{T <: Kmer, S <: BioSequence} end - -@inline Base.eltype(::Type{<:AbstractKmerIterator{T, S}}) where {T, S} = Tuple{UInt64, T} - -@inline Base.IteratorSize( - ::Type{<:AbstractKmerIterator{Kmer{A, K, N}, S}}, -) where {A, S <: BioSequence{A}, K, N} = Base.HasLength() -@inline Base.IteratorSize( - ::Type{<:AbstractKmerIterator{Kmer{A, K, N}, S}}, -) where {A, B, S <: BioSequence{B}, K, N} = Base.SizeUnknown() - -@inline function Base.length( - it::AbstractKmerIterator{Kmer{A, K, N}, S}, -) where {A, K, N, S <: BioSequence{A}} - return max(0, fld(it.stop - it.start + 1 - K, step(it)) + 1) -end - -# Iteration where the Kmer and Seq alphabets match: - -## Initial iteration without state. -@inline function Base.iterate( - it::AbstractKmerIterator{Kmer{A, K, N}, LongSequence{A}}, -) where {A, K, N} - fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) - if isnothing(fwkmer) - return nothing - else - # Get the reverse. - alph = Alphabet(Kmer{A, K, N}) - rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(alph) # Based on alphabet type, should constant fold. 
- rvkmer = rightshift_carry( - _reverse( - BioSequences.BitsPerSymbol(alph), - _complement_bitpar(alph, fwkmer...)..., - ), - rshift, - ) - return KmerAt{Kmer{A, K, N}}(1, Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer)), - (K, fwkmer, rvkmer) - end -end diff --git a/src/old_iter/EveryCanonicalKmer.jl b/src/old_iter/EveryCanonicalKmer.jl deleted file mode 100644 index 321437e..0000000 --- a/src/old_iter/EveryCanonicalKmer.jl +++ /dev/null @@ -1,150 +0,0 @@ -""" - EveryCanonicalKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -An iterator over every canonical valid overlapping `T<:Kmer` in a given longer -`BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by K - 1 bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not reliably be K - 1, - and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct EveryCanonicalKmer{T <: Kmer, S <: BioSequence{<:NucleicAcidAlphabet}} <: - AbstractKmerIterator{T, S} - seq::S - start::Int - stop::Int - - function EveryCanonicalKmer{T, S}( - seq::S, - start::Int=firstindex(seq), - stop::Int=lastindex(seq), - ) where {T <: Kmer, S <: BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. 
- return new{T′, S}(seq, start, stop) - end -end - -""" - EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `EveryCanonicalKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryCanonicalKmer{DNACodon}(s)` -""" -function EveryCanonicalKmer{T}( - seq::S, - start=firstindex(seq), - stop=lastindex(seq), -) where {T <: Kmer, S <: BioSequence} - return EveryCanonicalKmer{T, S}(seq, start, stop) -end - -""" - EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `EveryCanonicalKmer{DNAKmer{3,1}}(s)`, or `EveryCanonicalKmer{DNACodon}(s)`, -you can use `EveryCanonicalKmer(s, Val(3))` -""" -function EveryCanonicalKmer( - seq::BioSequence{A}, - ::Val{K}, - start=firstindex(seq), - stop=lastindex(seq), -) where {A, K} - return EveryCanonicalKmer{Kmer{A, K}}(seq, start, stop) -end - -Base.step(x::EveryCanonicalKmer) = 1 - -## Initial iteration without state. -@inline function Base.iterate( - it::EveryCanonicalKmer{Kmer{A, K, N}, LongSequence{A}}, -) where {A, K, N} - fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, it.start) - if isnothing(fwkmer) - return nothing - else - rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. 
- rvkmer = rightshift_carry( - _reverse( - BioSequences.BitsPerSymbol(A()), - _complement_bitpar(A(), fwkmer...)..., - ), - rshift, - ) - return (it.start, Kmer{A, K, N}(min(fwkmer, rvkmer))), - (it.start + K - 1, fwkmer, rvkmer) - end -end - -@inline function Base.iterate( - it::EveryCanonicalKmer{Kmer{A, K, N}, LongSequence{A}}, - state, -) where {A, K, N} - i, fwkmer, rvkmer = state - i += 1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * bps) # Based on type info, should constant fold. - mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. - - fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, bps, fbits) - rvkmer = rightshift_carry(rvkmer, bps, rbits) - pos = i - K + 1 - return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (i, fwkmer, rvkmer) - end -end - -@inline Base.IteratorSize( - ::Type{<:EveryCanonicalKmer{Kmer{A, N, K}, LongSequence{B}}}, -) where {A <: NucleicAcidAlphabet{2}, N, K, B <: NucleicAcidAlphabet{4}} = - Base.SizeUnknown() - -@inline function Base.iterate( - it::EveryCanonicalKmer{Kmer{A, K, N}, LongSequence{B}}, - state=(it.start - 1, 1, blank_ntuple(Kmer{A, K, N}), blank_ntuple(Kmer{A, K, N})), -) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} - i, filled, fwkmer, rvkmer = state - i += 1 - filled -= 1 - - rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * 2) # Based on type info, should constant fold. - mask = (one(UInt64) << 2) - one(UInt64) # Based on type info, should constant fold. 
- - while i ≤ it.stop - @inbounds nt = reinterpret(UInt8, it.seq[i]) - @inbounds fbits = kmerbits[nt + 1] - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, 2, fbits) - rvkmer = rightshift_carry(rvkmer, 2, rbits) - filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) - if filled == K - return (i - K + 1, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), - (i, filled, fwkmer, rvkmer) - end - i += 1 - end - return nothing -end diff --git a/src/old_iter/EveryKmer.jl b/src/old_iter/EveryKmer.jl deleted file mode 100644 index 115b41d..0000000 --- a/src/old_iter/EveryKmer.jl +++ /dev/null @@ -1,168 +0,0 @@ -### -### Kmer Iteration -### -### Iterator type over every kmer in a sequence - overlapping. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -""" - EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid overlapping `T<:Kmer` in a given longer -`BioSequence` between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by K - 1 bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not reliably be K - 1, - and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. 
-""" -struct EveryKmer{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} - seq::S - start::Int - stop::Int - - function EveryKmer{T, S}( - seq::S, - start::Int=firstindex(seq), - stop::Int=lastindex(seq), - ) where {T <: Kmer, S <: BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. - return new{T′, S}(seq, start, stop) - end -end - -""" - EveryKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `EveryKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryKmer{DNACodon}(s)` -""" -function EveryKmer{T}( - seq::S, - start=firstindex(seq), - stop=lastindex(seq), -) where {T <: Kmer, S <: BioSequence} - return EveryKmer{T, S}(seq, start, stop) -end - -""" - EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `EveryKmer{DNAKmer{3,1}}(s)`, or `EveryKmer{DNACodon}(s)`, -you can use `EveryKmer(s, Val(3))` -""" -function EveryKmer( - seq::BioSequence{A}, - ::Val{K}, - start=firstindex(seq), - stop=lastindex(seq), -) where {A, K} - return EveryKmer{Kmer{A, K}}(seq, start, stop) -end - -Base.step(x::EveryKmer) = 1 - -## Initial iteration without state. 
-@inline function Base.iterate(it::EveryKmer{Kmer{A, K, N}, LongSequence{A}}) where {A, K, N} - kmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) - if isnothing(kmer) - return nothing - else - return (1, Kmer{A, K, N}(kmer)), (K, kmer) - end -end - -@inline function Base.iterate( - it::EveryKmer{Kmer{A, K, N}, LongSequence{A}}, - state, -) where {A, K, N} - i, fwkmer = state - i += 1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - bits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - kmer = leftshift_carry(fwkmer, bps, bits) - pos = i - K + 1 - return (pos, Kmer{A, K, N}(kmer)), (i, kmer) - end -end - -## Special case where iterating over 2-Bit encoded kmers in a 4-Bit encoded sequence, -## behaviour is to produce kmers by skipping over the ambiguous sites. - -const kmerbits = ( - UInt64(0xff), - UInt64(0x00), - UInt64(0x01), - UInt64(0xff), - UInt64(0x02), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), - UInt64(0x03), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), - UInt64(0xff), -) - -@inline Base.IteratorSize( - ::Type{<:EveryKmer{Kmer{A, N, K}, S}}, -) where { - A <: NucleicAcidAlphabet{2}, - N, - K, - B <: NucleicAcidAlphabet{4}, - S <: BioSequence{B}, -} = Base.SizeUnknown() - -@inline function Base.iterate( - it::EveryKmer{Kmer{A, K, N}, S}, - state=(it.start - 1, 1, blank_ntuple(Kmer{A, K, N})), -) where { - A <: NucleicAcidAlphabet{2}, - B <: NucleicAcidAlphabet{4}, - S <: BioSequence{B}, - K, - N, -} - i, filled, fwkmer = state - i += 1 - filled -= 1 - - while i ≤ it.stop - @inbounds nt = reinterpret(UInt8, it.seq[i]) - @inbounds fbits = kmerbits[nt + 1] - fwkmer = leftshift_carry(fwkmer, 2, fbits) - filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) - if filled == K - return (i - K + 1, Kmer{A, K, N}(fwkmer)), (i, filled, fwkmer) - end - i += 1 - end - return nothing -end diff --git 
a/src/old_iter/SpacedCanonicalKmers.jl b/src/old_iter/SpacedCanonicalKmers.jl deleted file mode 100644 index 9c855c4..0000000 --- a/src/old_iter/SpacedCanonicalKmers.jl +++ /dev/null @@ -1,168 +0,0 @@ - -""" - SpacedCanonicalKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given -longer `BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by `max(0, K - step)` bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not consistent, but the - reading frame will be preserved. - In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct SpacedCanonicalKmers{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} - seq::S - start::Int - step::Int - stop::Int - filled::Int # This is cached for speed - increment::Int # This is cached for speed - - function SpacedCanonicalKmers{T, S}( - seq::S, - step::Int, - start::Int, - stop::Int, - ) where {T <: Kmer, S <: BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. 
- if step <= 1 - throw(ArgumentError("step size must be greater than 1")) - end - filled = max(0, ksize(T′) - step) - increment = max(1, step - ksize(T′) + 1) - return new{T′, S}(seq, start, step, stop, filled, increment) - end -end - -""" - SpacedCanonicalKmers{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `SpacedCanonicalKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedCanonicalKmers{DNACodon}(s, 3)` -""" -function SpacedCanonicalKmers{T}( - seq::S, - step::Int, - start=firstindex(seq), - stop=lastindex(seq), -) where {T <: Kmer, S <: BioSequence} - return SpacedCanonicalKmers{T, S}(seq, step, start, stop) -end - -""" - SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `SpacedCanonicalKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedCanonicalKmers{DNACodon}(s, 3)`, -you can use `SpacedCanonicalKmers(s, Val(3), 3)` -""" -function SpacedCanonicalKmers( - seq::BioSequence{A}, - ::Val{K}, - step::Int, - start=firstindex(seq), - stop=lastindex(seq), -) where {A, K} - return SpacedCanonicalKmers{Kmer{A, K}}(seq, step, start, stop) -end - -Base.step(x::SpacedCanonicalKmers) = x.step - -@inline function Base.iterate( - it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{A}}, -) where {A, K, N} - fwkmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) - if isnothing(fwkmer) - return nothing - else - rshift = n_unused(Kmer{A, K, N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. 
- rvkmer = rightshift_carry( - _reverse( - BioSequences.BitsPerSymbol(A()), - _complement_bitpar(A(), fwkmer...)..., - ), - rshift, - ) - return (1, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (K, fwkmer, rvkmer) - end -end - -@inline function Base.iterate( - it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{A}}, - state, -) where {A, K, N} - i, fwkmer, rvkmer = state - filled = it.filled - i += it.increment - - for _ in filled:(K - 1) - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A, K, N}) + 1) * bps) # Based on type info, should constant fold. - mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. - fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, bps, fbits) - rvkmer = rightshift_carry(rvkmer, bps, rbits) - i += 1 - end - end - pos = i - K + 1 - return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), (i, fwkmer, rvkmer) -end - -@inline function Base.iterate( - it::SpacedCanonicalKmers{Kmer{A, K, N}, LongSequence{B}}, - state=( - it.start - it.increment, - 1, - 0, - blank_ntuple(Kmer{A, K, N}), - blank_ntuple(Kmer{A, K, N}), - ), -) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} - i, pos, filled, fwkmer, rvkmer = state - i += it.increment - - while i ≤ it.stop - nt = reinterpret(UInt8, @inbounds getindex(it.seq, i)) - @inbounds fbits = UInt64(kmerbits[nt + 1]) - rbits = ~fbits & typeof(fbits)(0x03) - if fbits == 0xff # ambiguous - filled = 0 - # Find the beginning of next possible kmer after i - pos = i + it.step - Core.Intrinsics.urem_int(i - pos, it.step) - i = pos - 1 - else - filled += 1 - fwkmer = leftshift_carry(fwkmer, 2, fbits) - rvkmer = rightshift_carry(rvkmer, 2, UInt64(rbits) << (62 - (64N - 2K))) - end - if filled == K - state = (i, i - K + 1 + 
it.step, it.filled, fwkmer, rvkmer) - return (pos, min(Kmer{A, K, N}(fwkmer), Kmer{A, K, N}(rvkmer))), state - end - i += 1 - end - return nothing -end diff --git a/src/old_iter/SpacedKmers.jl b/src/old_iter/SpacedKmers.jl deleted file mode 100644 index fd5abbf..0000000 --- a/src/old_iter/SpacedKmers.jl +++ /dev/null @@ -1,150 +0,0 @@ - -""" - SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given -longer `BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by `max(0, K - step)` bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not consistent, but the - reading frame will be preserved. - In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct SpacedKmers{T <: Kmer, S <: BioSequence} <: AbstractKmerIterator{T, S} - seq::S - start::Int - step::Int - stop::Int - filled::Int # This is cached for speed - increment::Int # This is cached for speed - - function SpacedKmers{T, S}( - seq::S, - step::Int, - start::Int, - stop::Int, - ) where {T <: Kmer, S <: BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. 
- if step <= 1 - throw(ArgumentError("step size must be greater than 1")) - end - filled = max(0, ksize(T′) - step) - increment = max(1, step - ksize(T′) + 1) - return new{T′, S}(seq, start, step, stop, filled, increment) - end -end - -""" - SpacedKmers{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `SpacedKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedKmers{DNACodon}(s, 3)` -""" -function SpacedKmers{T}( - seq::S, - step::Int, - start=firstindex(seq), - stop=lastindex(seq), -) where {T <: Kmer, S <: BioSequence} - return SpacedKmers{T, S}(seq, step, start, stop) -end - -""" - SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `SpacedKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedKmers{DNACodon}(s, 3)`, -you can use `SpacedKmers(s, Val(3), 3)` -""" -function SpacedKmers( - seq::BioSequence{A}, - ::Val{K}, - step::Int, - start=firstindex(seq), - stop=lastindex(seq), -) where {A, K} - return SpacedKmers{Kmer{A, K}}(seq, step, start, stop) -end - -Base.step(x::SpacedKmers) = x.step - -@inline function Base.iterate( - it::SpacedKmers{Kmer{A, K, N}, LongSequence{A}}, -) where {A, K, N} - kmer = _build_kmer_data(Kmer{A, K, N}, it.seq, 1) - if isnothing(kmer) - return nothing - else - # Get the reverse. 
- alph = Alphabet(Kmer{A, K, N}) - return (1, Kmer{A, K, N}(kmer)), (K, kmer) - end -end - -@inline function Base.iterate( - it::SpacedKmers{Kmer{A, K, N}, LongSequence{A}}, - state, -) where {A, K, N} - i, kmer = state - filled = it.filled - i += it.increment - - for _ in filled:(K - 1) - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - bits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - kmer = leftshift_carry(kmer, bps, bits) - i += 1 - end - end - pos = i - K + 1 - return (pos, Kmer{A, K, N}(kmer)), (i, kmer) -end - -@inline function Base.iterate( - it::SpacedKmers{Kmer{A, K, N}, LongSequence{B}}, - state=(it.start - it.increment, 1, 0, blank_ntuple(Kmer{A, K, N})), -) where {A <: NucleicAcidAlphabet{2}, B <: NucleicAcidAlphabet{4}, K, N} - i, pos, filled, kmer = state - i += it.increment - - while i ≤ it.stop - nt = reinterpret(UInt8, @inbounds getindex(it.seq, i)) - @inbounds bits = UInt64(kmerbits[nt + 1]) - if bits == 0xff # ambiguous - filled = 0 - # Find the beginning of next possible kmer after i - pos = i + it.step - Core.Intrinsics.urem_int(i - pos, it.step) - i = pos - 1 - else - filled += 1 - kmer = leftshift_carry(kmer, 2, bits) - end - if filled == K - state = (i, i - K + 1 + it.step, it.filled, kmer) - return (pos, Kmer{A, K, N}(kmer)), state - end - i += 1 - end - return nothing -end From 47b4e039e4ee317e502b1083eb853a2f5414b0b8 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 31 Dec 2023 10:20:19 +0100 Subject: [PATCH 25/33] Add FwRvIterator --- docs/src/iteration.md | 7 ++ src/Kmers.jl | 1 + src/iterators/CanonicalKmers.jl | 192 ++++++++++++++++++------------ src/iterators/FwKmers.jl | 8 +- src/iterators/SpacedKmers.jl | 16 ++- src/iterators/UnambiguousKmers.jl | 8 +- 6 files changed, 144 insertions(+), 88 deletions(-) diff --git a/docs/src/iteration.md b/docs/src/iteration.md index 8dfa0de..543d017 100644 --- a/docs/src/iteration.md +++ 
b/docs/src/iteration.md @@ -37,6 +37,13 @@ FwRNAMers FwAAMers ``` +### `FwRvIterator` +This iterates over a nucleic acid sequence. For every kmer it encounters, it outputs the kmer and its reverse complement. + +```@docs +FwRvIterator +``` + ### `CanonicalKmers` This iterator is similar to [`FwKmers`](@ref), however, for each `Kmer` encountered, it returns the _canonical_ kmer. diff --git a/src/Kmers.jl b/src/Kmers.jl index 9dc31f0..c8ccfd4 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -32,6 +32,7 @@ export Kmer, FwDNAMers, FwRNAMers, FwAAMers, + FwRvIterator, CanonicalKmers, CanonicalDNAMers, CanonicalRNAMers, diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl index 075d7e7..6ece49d 100644 --- a/src/iterators/CanonicalKmers.jl +++ b/src/iterators/CanonicalKmers.jl @@ -1,166 +1,159 @@ """ - CanonicalKmers{A <: NucleicAcidAlphabet, K, S} - -Iterator of canonical nucleic acid kmers. The result of this iterator is equivalent -to calling `canonical` on each value of a `FwKmers` iterator, but may be more -efficient. + FwRvIterator{A <: NucleicAcidAlphabet, K, S} -!!! note - When counting small kmers, it may be more efficient to count `FwKmers`, - then call `canonical` only once per unique kmer. +Iterates 2-tuples of `(forward, reverse_complement)` kmers of type `Kmer{A, K}`. +`S` signifies the type of the underlying sequence, -Can be constructed more conventiently with the constructors `CanonicalDNAMers{K}(s)` -`CanonicalRNAMers{K}(s)` +See also: [`FwKmers`](@ref), [`CanonicalKmers`](@ref) # Examples: ```jldoctest -julia> collect(CanonicalRNAMers{3}("AGCGA")) -3-element Vector{Kmer{RNAAlphabet{2}, 3, 1}}: - AGC - CGC - CGA +julia> collect(FwRvIterator{DNAAlphabet{4}, 3}("AGCGT")) +3-element Vector{Tuple{Mer{3, DNAAlphabet{4}, 1}, Mer{3, DNAAlphabet{4}, 1}}}: + (AGC, GCT) + (GCG, CGC) + (CGT, ACG) + +julia> collect(FwRvIterator{DNAAlphabet{2}, 3}("AGNGT")) +ERROR: cannot encode 0x4e (Char 'N') in DNAAlphabet{2} +[...] 
``` """ -struct CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} - it::FwKmers{A, K, S} +struct FwRvIterator{A <: NucleicAcidAlphabet, K, S} + seq::S + + function FwRvIterator{A, K, S}(seq::S) where {A, K, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + new{A, K, S}(seq) + end end -source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S -@inline Base.length(it::CanonicalKmers) = length(it.it) +source_type(::Type{FwRvIterator{A, K, S}}) where {A, K, S} = S +kmertype(::Type{<:FwRvIterator{A, K}}) where {A, K} = derive_type(Kmer{A, K}) +kmertype(it::FwRvIterator) = kmertype(typeof(it)) +Base.eltype(T::Type{<:FwRvIterator{A, K}}) where {A, K} = + Tuple{K, K} where {K <: kmertype(T)} -# Constructors -function CanonicalKmers{A, K}(s::S) where {S, A <: NucleicAcidAlphabet, K} - CanonicalKmers{A, K, S}(FwKmers{A, K}(s)) +@inline function Base.length(it::FwRvIterator{A, K, S}) where {A, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + max(0, length(src) - K + 1) end -function CanonicalKmers{A, K, S}(s::S) where {S, A <: NucleicAcidAlphabet, K} - CanonicalKmers{A, K, S}(FwKmers{A, K}(s)) -end - -"`CanonicalDNAMers{K, S}`: Alias for `CanonicalKmers{DNAAlphabet{2}, K, S}`" -const CanonicalDNAMers{K, S} = CanonicalKmers{DNAAlphabet{2}, K, S} -"`CanonicalRNAMers{K, S}`: Alias for `CanonicalKmers{RNAAlphabet{2}, K, S}`" -const CanonicalRNAMers{K, S} = CanonicalKmers{RNAAlphabet{2}, K, S} +FwRvIterator{A, K}(s) where {A <: Alphabet, K} = FwRvIterator{A, K, typeof(s)}(s) -@inline function Base.iterate(it::CanonicalKmers{A, K, S}, state...) where {A, K, S} +@inline function Base.iterate(it::FwRvIterator{A, K, S}, state...) where {A, K, S} iterate_kmer(RecodingScheme(A(), S), it, state...) end # For the first kmer, we extract it, then reverse complement. # When it's not done incrementally, it's faster to RC the whole # kmer at once. 
-@inline function iterate_kmer(R::RecodingScheme, it::CanonicalKmers) - length(it.it.seq) < ksize(eltype(it)) && return nothing - fw = unsafe_extract(R, eltype(it), it.it.seq, 1) +@inline function iterate_kmer(R::RecodingScheme, it::FwRvIterator{A, K}) where {A, K} + length(it.seq) < K && return nothing + fw = unsafe_extract(R, kmertype(it), it.seq, 1) rv = reverse_complement(fw) - (fw < rv ? fw : rv, (fw, rv, ksize(eltype(it)) + 1)) + ((fw, rv), (fw, rv, K + 1)) end # Here, we need to convert to an abstractvector @inline function iterate_kmer( R::AsciiEncode, - it::CanonicalKmers{A, K, S}, + it::FwRvIterator{A, K, S}, ) where {A <: NucleicAcidAlphabet, K, S <: Bytes} - src = used_source(RecodingScheme(A(), S), it.it.seq) + src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) - length(src) < ksize(eltype(it)) && return nothing - fw = unsafe_extract(R, eltype(it), src, 1) + length(src) < K && return nothing + fw = unsafe_extract(R, kmertype(it), src, 1) rv = reverse_complement(fw) - (fw < rv ? fw : rv, (fw, rv, ksize(eltype(it)) + 1)) + ((fw, rv), (fw, rv, K + 1)) end @inline function iterate_kmer( ::GenericRecoding, - it::CanonicalKmers, + it::FwRvIterator, state::Tuple{Kmer, Kmer, Int}, ) (fw, rv, i) = state - i > length(it.it.seq) && return nothing - symbol = convert(eltype(fw), @inbounds it.it.seq[i]) + i > length(it.seq) && return nothing + symbol = convert(eltype(fw), @inbounds it.seq[i]) fw = shift(fw, symbol) rv = shift_first(rv, complement(symbol)) - (fw < rv ? 
fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) end @inline function iterate_kmer( ::Copyable, - it::CanonicalKmers{<:TwoBit, K, <:BioSequence{<:TwoBit}}, + it::FwRvIterator{<:TwoBit, K, <:BioSequence{<:TwoBit}}, state::Tuple{Kmer, Kmer, Int}, ) where {K} (fw, rv, i) = state - i > length(it.it.seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) fw = shift_encoding(fw, encoding) rv = shift_first_encoding(rv, encoding ⊻ 0x03) - (fw < rv ? fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) end @inline function iterate_kmer( ::Copyable, - it::CanonicalKmers{<:FourBit, K, <:BioSequence{<:FourBit}}, + it::FwRvIterator{<:FourBit, K, <:BioSequence{<:FourBit}}, state::Tuple{Kmer, Kmer, Int}, ) where {K} (fw, rv, i) = state - i > length(it.it.seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) fw = shift_encoding(fw, encoding) rc_encoding = reinterpret(UInt8, complement(reinterpret(eltype(rv), encoding % UInt8))) % UInt rv = shift_first_encoding(rv, rc_encoding) - (fw < rv ? fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) end -@inline function iterate_kmer( - ::TwoToFour, - it::CanonicalKmers, - state::Tuple{Kmer, Kmer, Int}, -) +@inline function iterate_kmer(::TwoToFour, it::FwRvIterator, state::Tuple{Kmer, Kmer, Int}) (fw, rv, i) = state - i > length(it.it.seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i)) + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) fw = shift_encoding(fw, left_shift(UInt(1), encoding)) rv = shift_first_encoding(rv, left_shift(UInt(1), encoding ⊻ 0x03)) - (fw < rv ? 
fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) end @inline function iterate_kmer( ::FourToTwo, - it::CanonicalKmers{A, K, <:BioSequence}, + it::FwRvIterator{A, K, <:BioSequence}, state::Tuple{Kmer, Kmer, Int}, ) where {A, K} (fw, rv, i) = state - i > length(it.it.seq) && return nothing - encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, i))::UInt + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt if count_ones(encoding) != 1 throw( BioSequences.EncodeError( Alphabet(fw), - reinterpret(eltype(it.it.seq), encoding % UInt8), + reinterpret(eltype(it.seq), encoding % UInt8), ), ) end enc = trailing_zeros(encoding) % UInt fw = shift_encoding(fw, enc) rv = shift_first_encoding(rv, enc ⊻ 0x03) - (fw < rv ? fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) end @inline function iterate_kmer( ::AsciiEncode, - it::CanonicalKmers, + it::FwRvIterator{A}, state::Tuple{Kmer, Kmer, Int}, -) - src = used_source( - RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), - it.it.seq, - ) +) where {A} + src = used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq) Base.require_one_based_indexing(src) (fw, rv, i) = state i > length(src) && return nothing byte = @inbounds src[i] - encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + encoding = BioSequences.ascii_encode(A(), byte) if encoding > 0x7f - throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + throw(BioSequences.EncodeError(A(), repr(byte))) end # Hopefully this branch is eliminated at compile time... rc_encoding = if Alphabet(fw) isa FourBit @@ -172,5 +165,56 @@ end end fw = shift_encoding(fw, encoding % UInt) rv = shift_first_encoding(rv, rc_encoding % UInt) - (fw < rv ? fw : rv, (fw, rv, i + 1)) + ((fw, rv), (fw, rv, i + 1)) +end + +""" + CanonicalKmers{A <: NucleicAcidAlphabet, K, S} + +Iterator of canonical nucleic acid kmers. 
The result of this iterator is equivalent +to calling `canonical` on each value of a `FwKmers` iterator, but may be more +efficient. + +!!! note + When counting small kmers, it may be more efficient to count `FwKmers`, + then call `canonical` only once per unique kmer. + +Can be constructed more conventiently with the constructors `CanonicalDNAMers{K}(s)` +`CanonicalRNAMers{K}(s)` + +# Examples: +```jldoctest +julia> collect(CanonicalRNAMers{3}("AGCGA")) +3-element Vector{Kmer{RNAAlphabet{2}, 3, 1}}: + AGC + CGC + CGA +``` +""" +struct CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} + it::FwRvIterator{A, K, S} +end + +source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S +@inline Base.length(it::CanonicalKmers) = length(it.it) + +# Constructors +function CanonicalKmers{A, K}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwRvIterator{A, K}(s)) +end +function CanonicalKmers{A, K, S}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwRvIterator{A, K}(s)) +end + +"`CanonicalDNAMers{K, S}`: Alias for `CanonicalKmers{DNAAlphabet{2}, K, S}`" +const CanonicalDNAMers{K, S} = CanonicalKmers{DNAAlphabet{2}, K, S} + +"`CanonicalRNAMers{K, S}`: Alias for `CanonicalKmers{RNAAlphabet{2}, K, S}`" +const CanonicalRNAMers{K, S} = CanonicalKmers{RNAAlphabet{2}, K, S} + +@inline function Base.iterate(it::CanonicalKmers{A, K, S}, state...) where {A, K, S} + it = iterate(it.it, state...) + isnothing(it) && return nothing + ((fw, rv), state) = it + (fw < rv ? fw : rv, state) end diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 3fb1d5a..4ed24d2 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -1,5 +1,5 @@ """ - FwKmers{A <: Alphabet, K, S} + FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} Iterator of forward kmers. `S` signifies the type of the underlying sequence, and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. 
@@ -32,7 +32,7 @@ source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S @inline function Base.length(it::FwKmers{A, K, S}) where {A, K, S} src = used_source(RecodingScheme(A(), S), it.seq) - max(0, length(src) - ksize(eltype(it)) + 1) + max(0, length(src) - K + 1) end # Constructors @@ -66,9 +66,9 @@ end ) where {A <: Alphabet, K, S <: Bytes} src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) - length(src) < ksize(eltype(it)) && return nothing + length(src) < K && return nothing kmer = unsafe_extract(R, eltype(it), src, 1) - (kmer, (kmer, ksize(eltype(it)) + 1)) + (kmer, (kmer, K + 1)) end @inline function iterate_kmer(::GenericRecoding, it::FwKmers, state::Tuple{Kmer, Int}) diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl index 6ace5af..792dfa6 100644 --- a/src/iterators/SpacedKmers.jl +++ b/src/iterators/SpacedKmers.jl @@ -1,5 +1,5 @@ """ - SpacedKmers{A <: Alphabet, K, J, S} + SpacedKmers{A <: Alphabet, K, J, S} <: AbstractKmerIterator{A, K} Iterator of kmers with step size. 
`J` signifies the step size, `S` the type of the underlying sequence, and the eltype of the iterator @@ -86,14 +86,14 @@ end R::RecodingScheme, it::SpacedKmers{A, K}, ) where {A <: Alphabet, K} - length(it.seq) < ksize(eltype(it)) && return nothing + length(it.seq) < K && return nothing kmer = unsafe_extract( R, eltype(it), used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq), 1, ) - next_index = 1 + max(stepsize(it), ksize(eltype(it))) + next_index = 1 + max(stepsize(it), K) (kmer, (kmer, next_index)) end @@ -105,13 +105,17 @@ end ) where {A <: Alphabet, K, J, S <: Bytes} src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) - length(src) < ksize(eltype(it)) && return nothing + length(src) < K && return nothing kmer = unsafe_extract(R, eltype(it), src, 1) - next_index = 1 + max(stepsize(it), ksize(eltype(it))) + next_index = 1 + max(stepsize(it), K) (kmer, (kmer, next_index)) end -@inline function iterate_kmer(::RecodingScheme, it::SpacedKmers{A, K, J, S}, state) where {A, K, S, J} +@inline function iterate_kmer( + ::RecodingScheme, + it::SpacedKmers{A, K, J, S}, + state, +) where {A, K, S, J} src = used_source(RecodingScheme(A(), S), it.seq) R = RecodingScheme(A(), S) Base.require_one_based_indexing(src) diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl index 57b2aaa..701678d 100644 --- a/src/iterators/UnambiguousKmers.jl +++ b/src/iterators/UnambiguousKmers.jl @@ -1,5 +1,5 @@ """ - UnambiguousKmers{A <: Union{DNAAlphabet{2}, RNAAlphabet{2}}, K, S} + UnambiguousKmers{A <: TwoBit, K, S} <: AbstractKmerIterator{A, K} Iterator of 2-bit nucleic acid kmers. 
This differs from `FwKmers` in that any kmers containing ambiguous nucleotides are skipped, whereas using `FwKmers`, they result @@ -45,7 +45,7 @@ const UnambiguousDNAMers{K, S} = UnambiguousKmers{DNAAlphabet{2}, K, S} const UnambiguousRNAMers{K, S} = UnambiguousKmers{RNAAlphabet{2}, K, S} @inline function Base.iterate(it::UnambiguousKmers{A, K, S}) where {A, K, S} - state = (eltype(it)(unsafe, zero_tuple(eltype(it))), ksize(eltype(it)), 1) + state = (eltype(it)(unsafe, zero_tuple(eltype(it))), K, 1) iterate_kmer(RecodingScheme(A(), S), it, state) end @@ -99,7 +99,7 @@ end if encoding == 0xff throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) elseif encoding == 0xf0 - remaining = ksize(eltype(it)) + remaining = K else remaining -= 1 kmer = shift_encoding(kmer, encoding % UInt) @@ -119,7 +119,7 @@ end encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, index))::UInt kmer = shift_encoding(kmer, (trailing_zeros(encoding)) % UInt) index += 1 - remaining = isone(count_ones(encoding)) ? remaining - 1 : ksize(eltype(it)) + remaining = isone(count_ones(encoding)) ? 
remaining - 1 : K end (kmer, (kmer, 1, index)) end From c280f48dbb19768c597769c79c1f9f50735bfa6d Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 31 Dec 2023 10:20:25 +0100 Subject: [PATCH 26/33] Push doc preview --- .github/workflows/UnitTests.yml | 18 ++++++++++++++++++ docs/make.jl | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index cff1346..71a40b6 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -42,3 +42,21 @@ jobs: name: codecov-umbrella fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@latest + with: + version: '1' + - run: | + julia --project=docs -e ' + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate()' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/docs/make.jl b/docs/make.jl index cd5d0d2..8b4ff1e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,7 +9,7 @@ DocMeta.setdocmeta!( makedocs(; modules=[Kmers], - format=Documenter.HTML(), + format=Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), sitename="Kmers.jl", pages=[ "Home" => "index.md", From 78fe5ddab7906b418b689f3769343b42119f31fb Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 31 Dec 2023 11:10:13 +0100 Subject: [PATCH 27/33] Misc cleanup --- .github/workflows/Documentation.yml | 1 - Project.toml | 2 +- docs/make.jl | 2 +- src/kmer.jl | 3 +- src/transformations.jl | 27 ++++----- src/tuple_bitflipping.jl | 85 ----------------------------- 6 files changed, 15 insertions(+), 105 deletions(-) diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml index 35d4097..c553475 100644 --- a/.github/workflows/Documentation.yml +++ 
b/.github/workflows/Documentation.yml @@ -21,6 +21,5 @@ jobs: run: julia --color=yes --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - name: Build and deploy env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key run: julia --color=yes --project=docs/ docs/make.jl diff --git a/Project.toml b/Project.toml index 4f8249f..747fb56 100644 --- a/Project.toml +++ b/Project.toml @@ -20,4 +20,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [targets] -test = ["Test", "Random", "BioSequences"] +test = ["Test", "Random"] diff --git a/docs/make.jl b/docs/make.jl index 8b4ff1e..d762c90 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,7 +9,7 @@ DocMeta.setdocmeta!( makedocs(; modules=[Kmers], - format=Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), + format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), sitename="Kmers.jl", pages=[ "Home" => "index.md", diff --git a/src/kmer.jl b/src/kmer.jl index 89f2501..beca16f 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -143,8 +143,7 @@ end @inline zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) -# TODO: Should this somehow throw a MethodError if N is already parameterized? 
-@inline function zero_kmer(T::Type{Kmer{A, K}}) where {A, K} +@inline function zero_kmer(::Type{<:Kmer{A, K}}) where {A, K} T2 = derive_type(Kmer{A, K}) T2(unsafe, zero_tuple(T2)) end diff --git a/src/transformations.jl b/src/transformations.jl index bb51127..7681422 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -47,13 +47,17 @@ function BioSequences.translate( allow_ambiguous_codons::Bool=true, # noop in this method alternative_start::Bool=false, ) + iszero(ksize(typeof(seq))) && return mer""a n_aa, remainder = divrem(length(seq), 3) iszero(remainder) || error("LongRNA length is not divisible by three. Cannot translate.") N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) T = Kmer{AminoAcidAlphabet, n_aa, N} data = zero_tuple(T) - @inbounds for i in 1:n_aa + # In the next two lines: If alternative_start, we shift in the encoding of M + # to first place, then we skip the first 3 nucleotides + (_, data) = leftshift_carry(data, 8, UInt(0x0c) * alternative_start) + @inbounds for i in (1 + (3 * alternative_start)):n_aa a = seq[3i - 2] b = seq[3i - 1] c = seq[3i - 0] @@ -63,18 +67,13 @@ function BioSequences.translate( (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end - result = T(unsafe, data) - if alternative_start && !iszero(ksize(typeof(seq))) - return setindex(result, 1, AA_M) - else - return result - end + T(unsafe, data) end function BioSequences.translate( seq::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; code::BioSequences.GeneticCode=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool=true, # noop in this method + allow_ambiguous_codons::Bool=true, alternative_start::Bool=false, ) n_aa, remainder = divrem(length(seq), 3) @@ -83,7 +82,10 @@ function BioSequences.translate( N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) T = Kmer{AminoAcidAlphabet, n_aa, N} data = zero_tuple(T) - @inbounds for i in 1:n_aa + # In the next two lines: If alternative_start, we shift in the 
encoding of M + # to first place, then we skip the first 3 nucleotides + (_, data) = leftshift_carry(data, 8, UInt(0x0c) * alternative_start) + @inbounds for i in (1 + (3 * alternative_start)):n_aa a = reinterpret(RNA, seq[3i - 2]) b = reinterpret(RNA, seq[3i - 1]) c = reinterpret(RNA, seq[3i - 0]) @@ -98,10 +100,5 @@ function BioSequences.translate( (_, data) = leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end - result = T(unsafe, data) - if alternative_start && !iszero(ksize(typeof(seq))) - return setindex(result, 1, AA_M) - else - return result - end + T(unsafe, data) end diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index bfbc4de..974bf55 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -1,88 +1,3 @@ - -# TODO: this should end up in BioSequences.jl? - -#= -"Extract the element stored in a packed bitarray referred to by bidx." -@inline function BioSequences.extract_encoded_element( - bidx::BioSequences.BitIndex{N, W}, - data::NTuple{n, W}, -) where {N, n, W} - @inbounds chunk = data[BioSequences.index(bidx)] - offchunk = chunk >> (BioSequences.bitwidth(W) - N - BioSequences.offset(bidx)) - return offchunk & BioSequences.bitmask(bidx) -end - -""" - _cliphead(by::Integer, head::UInt64, tail...) - -A method used to mask the first `by` MSB's in `head`, before catting it with -tail to return a NTuple. - -This is used internally to mask the first `by` bits in the first word of a -NTuple of UInt64's. - -Notably it's used when constructing a Kmer from an existing NTuple of UInt64 -""" -@inline function _cliphead(by::Integer, head::UInt64, tail...) - return (head & (typemax(UInt64) >> by), tail...) -end - -# rightshift_carry & leftshift_carry - -# These methods are micro-optimised (or should be!!!) for shifting the bits in -# an NTuple of unsigned integers, carrying the bits "shifted off" one word -# over to the next word. 
The carry can also be "seeded" so as other methods like -# pushfirst and pushlast can be efficiently implemented without duplication of code -# or less efficient implementations that first shift and then insert an element. - -@inline function rightshift_carry( - x::NTuple{N, UInt64}, - nbits::Integer, - prevcarry=zero(UInt64), -) where {N} - return _rightshift_carry(nbits, prevcarry, x...) -end - -@inline function _rightshift_carry(nbits::Integer, carry::UInt64, head::UInt64, tail...) - return ( - (head >> nbits) | carry, - _rightshift_carry( - nbits, - (head & ((one(UInt64) << nbits) - 1)) << (64 - nbits), - tail..., - )..., - ) -end - -@inline _rightshift_carry(nbits::Integer, carry::UInt64) = () - -@inline function leftshift_carry( - x::NTuple{N, UInt64}, - nbits::Integer, - prevcarry::UInt64=zero(UInt64), -) where {N} - _, newbits = _leftshift_carry(nbits, prevcarry, x...) - return newbits -end - -@inline function _leftshift_carry(nbits::Integer, prevcarry::UInt64, head::UInt64, tail...) - carry, newtail = _leftshift_carry(nbits, prevcarry, tail...) - return head >> (64 - nbits), ((head << nbits) | carry, newtail...) 
-end - -@inline _leftshift_carry(nbits::Integer, prevcarry::UInt64) = prevcarry, () - -@inline function _reverse( - bpe::BioSequences.BitsPerSymbol{N}, - head::UInt64, - tail..., -) where {N} - return (_reverse(bpe, tail...)..., BioSequences.reversebits(head, bpe)) -end - -@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () -=# - # These compile to raw CPU instructions and are therefore more # efficient than simply using << and >>> @inline function left_shift(x::Unsigned, n::Integer) From 320596bfb81d7b916b6579c18dc61fdfceb51972 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 14:22:33 +0100 Subject: [PATCH 28/33] Add FxHash --- docs/make.jl | 1 + docs/src/hashing.md | 59 ++++ src/Kmers.jl | 1 + src/kmer.jl | 78 ++++- test/access.jl | 94 ------ test/comparisons.jl | 64 ---- test/construction_and_conversion.jl | 473 ---------------------------- test/debruijn_neighbors.jl | 10 - test/find.jl | 46 --- test/iteration.jl | 236 -------------- test/length.jl | 7 - test/mismatches.jl | 51 --- test/order.jl | 15 - test/print.jl | 37 --- test/random.jl | 26 -- test/shuffle.jl | 25 -- test/transformations.jl | 60 ---- 17 files changed, 129 insertions(+), 1154 deletions(-) create mode 100644 docs/src/hashing.md delete mode 100644 test/access.jl delete mode 100644 test/comparisons.jl delete mode 100644 test/construction_and_conversion.jl delete mode 100644 test/debruijn_neighbors.jl delete mode 100644 test/find.jl delete mode 100644 test/iteration.jl delete mode 100644 test/length.jl delete mode 100644 test/mismatches.jl delete mode 100644 test/order.jl delete mode 100644 test/print.jl delete mode 100644 test/random.jl delete mode 100644 test/shuffle.jl delete mode 100644 test/transformations.jl diff --git a/docs/make.jl b/docs/make.jl index d762c90..75ec010 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -16,6 +16,7 @@ makedocs(; "The Kmer type" => "kmers.md", "Iteration" => "iteration.md", "Translation" => "translation.md", + "Hashing" => 
"hashing.md", "FAQ" => "faq.md", ], authors="Jakob Nybo Nissen, Sabrina J. Ward, The BioJulia Organisation and other contributors.", diff --git a/docs/src/hashing.md b/docs/src/hashing.md new file mode 100644 index 0000000..9f71980 --- /dev/null +++ b/docs/src/hashing.md @@ -0,0 +1,59 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +!!! warning + The value of hashes are guaranteed to be reproducible for a given version + of Kmers.jl and Julia, but may __change__ in new minor versions of Julia + or Kmers.jl + +## Hashing +Kmers implement `Base.hash`, yielding a `UInt` value: + +```jldoctest; filter = r"^0x[0-9a-fA-F]+$" +julia> hash(mer"UGCUGUAC"r) +0xe5057d38c8907b22 +``` + +The implementation of `Base.hash` for kmers strikes a compromise between providing a high-quality (non-cryptographic) hash, while being reasonably fast. +While hash collisions can easily be found, they are unlikely to occur at random. +When kmers are of the same (or compatible) alphabets, different kmers hash to different values, even when they have the same underlying bitpattern: + +```jldoctest +julia> using BioSequences: encoded_data + +julia> a = mer"TAG"d; b = mer"AAAAAAATAG"d; + +julia> encoded_data(a) === encoded_data(b) +true + +julia> hash(a) == hash(b) +false +``` + +When they are of compatible alphabets, and have the same content, they hash to the same value. +Currently, only DNA and RNA of the alphabets `DNAAlphabet` and `RNAAlphabet` are compatible: + +```jldoctest +julia> a = mer"UUGU"r; b = mer"TTGT"d; + +julia> a == b # equal +true + +julia> a === b # not egal +false + +julia> hash(a) === hash(b) +true +``` + +For some applications, fast hashing is absolutely crucial. 
For these cases, Kmers.jl provides [`fx_hash`](@ref), which trades off hash quality for speed: + +```@docs +fx_hash +``` \ No newline at end of file diff --git a/src/Kmers.jl b/src/Kmers.jl index c8ccfd4..7d5e45e 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -18,6 +18,7 @@ export Kmer, reverse_translate, reverse_translate!, @mer_str, + fx_hash, # Immutable operations push, diff --git a/src/kmer.jl b/src/kmer.jl index beca16f..9421279 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -120,8 +120,6 @@ end @inline bits_unused(T::Type{<:Kmer}) = n_unused(T) * BioSequences.bits_per_symbol(Alphabet(T)) -@inline BioSequences.Alphabet(::Kmer{A}) where {A} = A() - @inline function n_coding_elements(::Type{<:Kmer{A, K}}) where {A, K} cld(BioSequences.bits_per_symbol(A()) * K, 8 * sizeof(UInt)) end @@ -168,13 +166,21 @@ function Base.show(io::IO, ::MIME"text/plain", s::Kmer) print(io, s) end +# TODO: This is only efficient because the compiler, through Herculean effort, +# is able to completely unroll and inline the indexing operation. @inline function _cmp(x::Kmer{A1, K1}, y::Kmer{A2, K2}) where {A1, A2, K1, K2} - if K1 < K2 - -1 - elseif K2 < K1 - 1 - else + if K1 == K2 cmp(x.data, y.data) + else + m = min(K1, K2) + a = @inline x[1:m] + b = @inline y[1:m] + c = cmp(a.data, b.data) + if iszero(c) + K1 < K2 ? -1 : K2 < K1 ? 
1 : 0 + else + c + end end end @@ -184,14 +190,66 @@ end Base.cmp(x::Kmer{A}, y::Kmer{A}) where {A} = _cmp(x, y) Base.cmp(x::Kmer{<:FourBit}, y::Kmer{<:FourBit}) = _cmp(x, y) Base.cmp(x::Kmer{<:TwoBit}, y::Kmer{<:TwoBit}) = _cmp(x, y) +Base.cmp(x::Kmer{A}, y::Kmer{B}) where {A, B} = throw(MethodError(cmp, (x, y))) -Base.isless(x::Kmer, y::Kmer) = cmp(x, y) == -1 -Base.:(==)(x::Kmer, y::Kmer) = iszero(cmp(x, y)) +Base.isless(x::Kmer, y::Kmer) = @inline(cmp(x, y)) == -1 +Base.:(==)(x::Kmer, y::Kmer) = iszero(@inline cmp(x, y)) Base.:(==)(x::Kmer, y::BioSequence) = throw(MethodError(==, (x, y))) Base.:(==)(x::BioSequence, y::Kmer) = throw(MethodError(==, (x, y))) -Base.hash(x::Kmer{A, K, N}, h::UInt) where {A, K, N} = hash(x.data, h ⊻ K) +Base.hash(x::Kmer, h::UInt) = hash(x.data, h ⊻ ksize(typeof(x))) + +# These constants are from the original implementation +@static if Sys.WORD_SIZE == 32 + # typemax(UInt32) / golden ratio + const FX_CONSTANT = 0x9e3779b9 +elseif Sys.WORD_SIZE == 64 + # typemax(UInt64) / pi + const FX_CONSTANT = 0x517cc1b727220a95 +else + error("Invalid word size") +end + +# This implementation is translated from the Rust compiler source code, +# licenced under MIT. The original source is the Firefox source code, +# also freely licensed. +""" + fx_hash(x, [h::UInt])::UInt + +An implementation of `FxHash`. This hash function is extremely fast, but the hashes +are of poor quality compared to Julia's default MurmurHash3. In particular: +* The value of any particular bit in the output depends only on bits in the same, + and lower positions +* The bitpattern zero hashes to zero + +However, for many applications, `FxHash` is good enough, where the higher rate of +hash collisions are offset by the faster speed. + +The precise hash value of a given kmer is not guaranteed to be stable across minor +releases of Kmers.jl, but _is_ guaranteed to be stable across minor versions of +Julia. 
+ +# Examples +```jldoctest +julia> x = fx_hash(mer"KWQLDE"a); + +julia> y = fx_hash(mer"KWQLDE"a, UInt(1)); + +julia> x isa UInt +true + +julia> x == y +false +``` +""" +function fx_hash(x::Kmer, h::UInt) + for i in x.data + h = (bitrotate(h, 5) ⊻ i) * FX_CONSTANT + end + h +end +fx_hash(x) = fx_hash(x, zero(UInt)) """ push(kmer::Kmer{A, K}, s)::Kmer{A, K+1} diff --git a/test/access.jl b/test/access.jl deleted file mode 100644 index 71dd487..0000000 --- a/test/access.jl +++ /dev/null @@ -1,94 +0,0 @@ -@testset "Access and Iterations" begin - dna_kmer = mer"ACTG"dna - rna_kmer = mer"ACUG"rna - aa_kmer = mer"MVXN"aa - - @testset "Access DNA Kmer" begin - @test dna_kmer[1] == DNA_A - @test dna_kmer[2] == DNA_C - @test dna_kmer[3] == DNA_T - @test dna_kmer[4] == DNA_G - - @test dna_kmer[1:3] == mer"ACT"dna - @test dna_kmer[2:4] == mer"CTG"dna - - # Access indexes out of bounds - @test_throws BoundsError dna_kmer[-1] - @test_throws BoundsError dna_kmer[0] - @test_throws BoundsError dna_kmer[5] - @test_throws BoundsError getindex(dna_kmer, -1) - @test_throws BoundsError getindex(dna_kmer, 0) - @test_throws BoundsError getindex(dna_kmer, 5) - @test_throws BoundsError dna_kmer[3:7] - end - - @testset "Iteration through DNA Kmer" begin - @test iterate(DNAKmer("ACTG")) == (DNA_A, 2) - - @test iterate(DNAKmer("ACTG"), 1) == (DNA_A, 2) - @test iterate(DNAKmer("ACTG"), 4) == (DNA_G, 5) - - @test iterate(DNAKmer("ACTG"), 1) !== nothing - @test iterate(DNAKmer("ACTG"), 4) !== nothing - @test iterate(DNAKmer("ACTG"), 5) === nothing - @test isnothing(iterate(DNAKmer("ACTG"), -1)) - @test iterate(DNAKmer("ACTG"), 0) === nothing - - dna_vec = [DNA_A, DNA_C, DNA_T, DNA_G] - @test all([nt === dna_vec[i] for (i, nt) in enumerate(dna_kmer)]) - end - - @testset "Access RNA Kmer" begin - @test rna_kmer[1] == RNA_A - @test rna_kmer[2] == RNA_C - @test rna_kmer[3] == RNA_U - @test rna_kmer[4] == RNA_G - - @test rna_kmer[1:3] == mer"ACU"rna - @test rna_kmer[2:4] == mer"CUG"rna - - # Access 
indexes out of bounds - @test_throws BoundsError rna_kmer[-1] - @test_throws BoundsError rna_kmer[0] - @test_throws BoundsError rna_kmer[5] - @test_throws BoundsError getindex(rna_kmer, -1) - @test_throws BoundsError getindex(rna_kmer, 0) - @test_throws BoundsError getindex(rna_kmer, 5) - @test_throws BoundsError rna_kmer[3:7] - end - - @testset "Iteration through RNA Kmer" begin - @test iterate(RNAKmer("ACUG")) == (RNA_A, 2) - - @test iterate(RNAKmer("ACUG"), 1) == (RNA_A, 2) - @test iterate(RNAKmer("ACUG"), 4) == (RNA_G, 5) - - @test iterate(RNAKmer("ACUG"), 1) !== nothing - @test iterate(RNAKmer("ACUG"), 4) !== nothing - @test iterate(RNAKmer("ACUG"), 5) === nothing - @test iterate(RNAKmer("ACUG"), -1) === nothing - @test iterate(RNAKmer("ACUG"), 0) === nothing - - rna_vec = [RNA_A, RNA_C, RNA_U, RNA_G] - @test all([nt === rna_vec[i] for (i, nt) in enumerate(rna_kmer)]) - end - - @testset "Access AA Kmer" begin - @test aa_kmer[1] == AA_M - @test aa_kmer[2] == AA_V - @test aa_kmer[3] == AA_X - @test aa_kmer[4] == AA_N - - @test aa_kmer[1:3] == mer"MVX"aa - @test aa_kmer[2:4] == mer"VXN"aa - - # Access indexes out of bounds - @test_throws BoundsError aa_kmer[-1] - @test_throws BoundsError aa_kmer[0] - @test_throws BoundsError aa_kmer[5] - @test_throws BoundsError getindex(aa_kmer, -1) - @test_throws BoundsError getindex(aa_kmer, 0) - @test_throws BoundsError getindex(aa_kmer, 5) - @test_throws BoundsError aa_kmer[3:7] - end -end diff --git a/test/comparisons.jl b/test/comparisons.jl deleted file mode 100644 index 9c84e05..0000000 --- a/test/comparisons.jl +++ /dev/null @@ -1,64 +0,0 @@ -@testset "Comparisons" begin - @testset "Equality" begin - function check_seq_kmer_equality(len) - a = DNAKmer(random_dna_kmer(len)) - b = LongDNA{4}(a) - c = LongDNA{2}(a) - return a == b == c && c == b == a - end - - for len in [1, 10, 32, 64, 128] - @test all(Bool[check_seq_kmer_equality(len) for _ in 1:reps]) - end - - # True negatives - @test DNAKmer("ACG") != RNAKmer("ACG") - 
@test DNAKmer("T") != RNAKmer("U") - @test DNAKmer("AC") != DNAKmer("AG") - @test RNAKmer("AC") != RNAKmer("AG") - @test AAKmer("MV") != AAKmer("NM") - - @test DNAKmer("ACG") != rna"ACG" - @test DNAKmer("T") != rna"U" - @test DNAKmer("AC") != dna"AG" - @test RNAKmer("AC") != rna"AG" - @test AAKmer("MV") != aa"NM" - - @test rna"ACG" != DNAKmer("ACG") - @test rna"U" != DNAKmer("T") - @test dna"AG" != DNAKmer("AC") - @test rna"AG" != RNAKmer("AC") - @test aa"MV" != AAKmer("NM") - end - - @testset "Inequality" begin - for len in [1, 10, 32, 64] - if len <= 32 - @test isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(1),))) - @test !isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(0),))) - @test !isless(DNAKmer{1}((UInt64(1),)), DNAKmer{1}((UInt64(0),))) - - @test isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(1),))) - @test !isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(0),))) - @test !isless(RNAKmer{1}((UInt64(1),)), RNAKmer{1}((UInt64(0),))) - end - end - end - - @testset "Hash" begin - kmers = map(DNAKmer, ["AAAA", "AACT", "ACGT", "TGCA"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - - kmers = map(RNAKmer, ["AAAA", "AACU", "ACGU", "UGCA"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - - kmers = map(AAKmer, ["AMVK", "FPST", "QEGH", "ARND"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - end -end diff --git a/test/construction_and_conversion.jl b/test/construction_and_conversion.jl deleted file mode 100644 index fb7f47d..0000000 --- a/test/construction_and_conversion.jl +++ /dev/null @@ -1,473 +0,0 @@ -global reps = 10 - -@testset "Construction and Conversions" begin - @test Kmer(DNA_A, DNA_G, DNA_T) === Kmer("AGT") - @test Kmer(RNA_A, RNA_G, RNA_U) === Kmer("AGU") - #@test Kmer(AA_R, AA_D, AA_C, AA_B) === Kmer("RDCB") - - @test DNAKmer(DNA_G, DNA_C, DNA_T) == Kmer("GCT") - @test RNAKmer(RNA_G, RNA_U, RNA_C, RNA_U) == Kmer("GUCU") - - # creation from 
iterator - @test Kmers.kmertype(Kmer{DNAAlphabet{2}, 31})((i for i in rand(ACGT, 31))) isa - Kmers.kmertype(Kmer{DNAAlphabet{2}, 31}) - - # Check that kmers in strings survive round trip conversion: - # String → Kmer → String - function check_string_construction(::Type{T}, seq::AbstractString) where {T <: Kmer} - return String(T(seq)) == uppercase(seq) - end - - # Check that RNAKmers can be constructed from a LongRNASeq - # LongSequence{A} → Kmer{A,K,N} → LongSequence{A} - function check_longsequence_construction( - ::Type{T}, - seq::S, - ) where {T <: Kmer, S <: LongSequence} - return S(T(seq)) == seq - end - - # Check that kmers can be constructed from a BioSequence - # BioSequence → Kmer → BioSequence - function check_biosequence_construction(::Type{T}, seq::LongSequence) where {T <: Kmer} - return LongSequence(T(seq)) == seq - end - - # Check that kmers can be constructed from an array of nucleotides - # Vector{T} → Kmer → Vector{T} - function check_nucarray_kmer(::Type{M}, seq::Vector{T}) where {T, M <: Kmer} - return String([convert(Char, c) for c in seq]) == String(M(seq)) - end - - # Check that kmers in strings survive round trip conversion: - # String → BioSequence → Kmer → BioSequence → String - function check_roundabout_construction( - ::Type{T}, - A2, - seq::AbstractString, - ) where {T <: Kmer} - return String(LongSequence{A2}(T(LongSequence{A2}(seq)))) == uppercase(seq) - end - - #= - function check_uint_conversion(::Type{T}) where {T<:Kmer} - U = BioSequences.encoded_data_type(T) - uint = rand(typemin(U):U(one(U) << 2BioSequences.ksize(T) - 1)) - return convert(U, T(uint)) === uint - end - =# - - @testset "Kmer conversion" begin - for len in [1, 16, 32, 64, 128] - # String construction - # Check that kmers in strings survive round trip conversion: - # String → Kmer → String - @test all( - Bool[ - check_string_construction(DNAKmer{len}, random_dna_kmer(len)) for - _ in 1:reps - ], - ) - @test all( - Bool[ - check_string_construction( - 
Kmer{DNAAlphabet{4}, len}, - random_dna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_string_construction(RNAKmer{len}, random_rna_kmer(len)) for - _ in 1:reps - ], - ) - @test all( - Bool[ - check_string_construction( - Kmer{RNAAlphabet{4}, len}, - random_rna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps - ], - ) - - # Long(DNA|RNA)Seq Constructions - # Check that DNAKmers can be constructed from a Long(DNA|RNA)Seq - # Long(DNA|RNA)Seq → Kmer → Long(DNA|RNA)Seq - @test all( - Bool[ - check_longsequence_construction( - Kmer{DNAAlphabet{2}, len}, - LongDNA{2}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{DNAAlphabet{4}, len}, - LongDNA{4}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{DNAAlphabet{4}, len}, - LongDNA{2}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{DNAAlphabet{2}, len}, - LongDNA{4}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{2}, len}, - LongRNA{2}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{4}, len}, - LongRNA{4}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{4}, len}, - LongRNA{2}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{2}, len}, - LongRNA{4}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction(AAKmer{len}, LongAA(random_aa(len))) for - _ in 1:reps - ], - ) - - # Check Kmer{A1}(::BioSequence{A2}) for compatible A1 and A2 - @test all( - Bool[ - 
check_longsequence_construction( - Kmer{RNAAlphabet{4}}, - LongRNA{2}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{2}}, - LongDNA{4}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{RNAAlphabet{4}}, - LongDNA{4}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction( - Kmer{DNAAlphabet{2}}, - LongRNA{4}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - - # BioSequence Construction - # Check that kmers can be constructed from a BioSequence - # BioSequence → Kmer → BioSequence - @test all( - Bool[ - check_biosequence_construction( - Kmer{DNAAlphabet{2}, len}, - LongSequence{DNAAlphabet{2}}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{DNAAlphabet{4}, len}, - LongSequence{DNAAlphabet{4}}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{DNAAlphabet{2}, len}, - LongSequence{DNAAlphabet{4}}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{DNAAlphabet{4}, len}, - LongSequence{DNAAlphabet{2}}(random_dna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{RNAAlphabet{2}, len}, - LongSequence{RNAAlphabet{2}}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{RNAAlphabet{4}, len}, - LongSequence{RNAAlphabet{4}}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{RNAAlphabet{2}, len}, - LongSequence{RNAAlphabet{4}}(random_rna_kmer(len)), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - Kmer{RNAAlphabet{4}, len}, - LongSequence{RNAAlphabet{2}}(random_rna_kmer(len)), - ) for 
_ in 1:reps - ], - ) - @test all( - Bool[ - check_biosequence_construction( - AAKmer{len}, - LongSequence{AminoAcidAlphabet}(random_aa(len)), - ) for _ in 1:reps - ], - ) - - # Check Kmer(::BioSequence) construction - @test all( - Bool[ - check_longsequence_construction(Kmer, LongRNA{4}(random_rna_kmer(len))) - for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction(Kmer, LongDNA{2}(random_dna_kmer(len))) - for _ in 1:reps - ], - ) - @test all( - Bool[ - check_longsequence_construction(Kmer, LongAA(random_rna_kmer(len))) for - _ in 1:reps - ], - ) - - # Construction from element arrays - # Check that kmers can be constructed from an array of elements - # Vector{T} → Kmer{A,K,N} → Vector{T} - @test all( - Bool[ - check_nucarray_kmer( - Kmer{DNAAlphabet{2}, len}, - random_dna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0]), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_nucarray_kmer(Kmer{DNAAlphabet{4}, len}, random_dna_symbols(len)) - for _ in 1:reps - ], - ) - @test all( - Bool[ - check_nucarray_kmer( - Kmer{RNAAlphabet{2}, len}, - random_rna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0]), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_nucarray_kmer(Kmer{RNAAlphabet{4}, len}, random_rna_symbols(len)) - for _ in 1:reps - ], - ) - @test all( - Bool[ - check_nucarray_kmer(AAKmer{len}, random_aa_symbols(len)) for _ in 1:reps - ], - ) - - # Roundabout conversions - @test all( - Bool[ - check_roundabout_construction( - Kmer{DNAAlphabet{2}, len}, - DNAAlphabet{2}, - random_dna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{DNAAlphabet{4}, len}, - DNAAlphabet{4}, - random_dna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{DNAAlphabet{2}, len}, - DNAAlphabet{4}, - random_dna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{DNAAlphabet{4}, len}, - DNAAlphabet{2}, - 
random_dna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{RNAAlphabet{2}, len}, - RNAAlphabet{2}, - random_rna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{RNAAlphabet{4}, len}, - RNAAlphabet{4}, - random_rna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{RNAAlphabet{2}, len}, - RNAAlphabet{4}, - random_rna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - Kmer{RNAAlphabet{4}, len}, - RNAAlphabet{2}, - random_rna_kmer(len), - ) for _ in 1:reps - ], - ) - @test all( - Bool[ - check_roundabout_construction( - AAKmer{len}, - AminoAcidAlphabet, - random_aa(len), - ) for _ in 1:reps - ], - ) - end - end - - @test_throws MethodError Kmer() # can't construct 0-mer using `Kmer()` - @test_throws ArgumentError DNAKmer(dna"") # 0-mers not allowed - @test_throws ArgumentError AAKmer(aa"") # 0-mers not allowed - @test_throws ArgumentError DNAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws ArgumentError RNAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws ArgumentError AAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws BioSequences.EncodeError Kmer(RNA_A, RNA_C, RNA_G, RNA_N, RNA_U) # no Ns in kmers - @test_throws BioSequences.EncodeError Kmer(DNA_A, DNA_C, DNA_G, DNA_N, DNA_T) # no Ns in kmers - @test_throws BioSequences.EncodeError RNAKmer(rna"ACGNU") # no Ns in 2-bit nucleic acid kmers - @test_throws BioSequences.EncodeError DNAKmer(dna"ACGNT") # no Ns in 2-bit nucleic acid kmers - @test_throws MethodError Kmer(RNA_A, DNA_A) # no mixing of RNA and DNA - - @testset "From strings" begin - @test DNAKmer("ACTG") == DNAKmer(LongDNA{4}("ACTG")) - @test RNAKmer("ACUG") == RNAKmer(LongRNA{4}("ACUG")) - - # N is not allowed in Kmers - @test_throws Exception DNAMmer("ACGTNACGT") - @test_throws Exception RNAKmer("ACGUNACGU") - - # Test string literals - @test 
mer"ACTG"dna == DNAKmer(LongDNA{4}("ACTG")) - @test mer"AVBM"aa == AAKmer(LongAA("AVBM")) - @test isa(mer"ACGT"dna, DNAKmer{4}) - @test isa(mer"AVBM"aa, AAKmer{4}) - @test_throws LoadError eval(:(mer"ACGN"dna)) - @test_throws LoadError eval(:(mer"ACG-"dna)) - end - - @testset "Capacity" begin - @test Kmers.capacity(DNAKmer(random_dna_kmer(10))) == 32 - @test Kmers.capacity(RNAKmer(random_rna_kmer(10))) == 32 - @test Kmers.capacity(DNAKmer(random_dna_kmer(32))) == 32 - @test Kmers.capacity(RNAKmer(random_rna_kmer(32))) == 32 - @test Kmers.capacity(DNAKmer(random_dna_kmer(33))) == 64 - @test Kmers.capacity(AAKmer(random_aa(8))) == 8 - @test Kmers.capacity(AAKmer(random_aa(10))) == 16 - end - - @testset "N unused" begin - @test Kmers.n_unused(DNAKmer(random_dna_kmer(10))) == 22 - @test Kmers.n_unused(RNAKmer(random_rna_kmer(10))) == 22 - @test Kmers.n_unused(DNAKmer(random_dna_kmer(32))) == 0 - @test Kmers.n_unused(RNAKmer(random_rna_kmer(32))) == 0 - @test Kmers.n_unused(DNAKmer(random_dna_kmer(33))) == 31 - @test Kmers.n_unused(AAKmer(random_aa(8))) == 0 - @test Kmers.n_unused(AAKmer(random_aa(10))) == 6 - end -end diff --git a/test/debruijn_neighbors.jl b/test/debruijn_neighbors.jl deleted file mode 100644 index 1f92fd6..0000000 --- a/test/debruijn_neighbors.jl +++ /dev/null @@ -1,10 +0,0 @@ -@testset "De Bruijn Neighbors" begin - @test collect(fw_neighbors(DNAKmer("ACG"))) == - map(DNAKmer, ["CGA", "CGC", "CGG", "CGT"]) - @test collect(fw_neighbors(DNAKmer("GGGG"))) == - map(DNAKmer, ["GGGA", "GGGC", "GGGG", "GGGT"]) - @test collect(fw_neighbors(RNAKmer("ACG"))) == - map(RNAKmer, ["CGA", "CGC", "CGG", "CGU"]) - @test collect(fw_neighbors(RNAKmer("GGGG"))) == - map(RNAKmer, ["GGGA", "GGGC", "GGGG", "GGGU"]) -end diff --git a/test/find.jl b/test/find.jl deleted file mode 100644 index b8cab44..0000000 --- a/test/find.jl +++ /dev/null @@ -1,46 +0,0 @@ -@testset "Find" begin - kmer = DNAKmer("ACGAG") - - @test findnext(DNA_A, kmer, 1) == 1 - @test findnext(DNA_C, kmer, 
1) == 2 - @test findnext(DNA_G, kmer, 1) == 3 - @test findnext(DNA_T, kmer, 1) == nothing - @test findnext(DNA_A, kmer, 2) == 4 - - @test_throws BoundsError findnext(DNA_A, kmer, 0) - @test findnext(DNA_A, kmer, 6) === nothing - - @test findprev(DNA_A, kmer, 5) == 4 - @test findprev(DNA_C, kmer, 5) == 2 - @test findprev(DNA_G, kmer, 5) == 5 - @test findprev(DNA_T, kmer, 5) == nothing - @test findprev(DNA_G, kmer, 4) == 3 - - @test findprev(DNA_A, kmer, 0) === nothing - @test_throws BoundsError findprev(DNA_A, kmer, 6) - - @test findfirst(DNA_A, kmer) == 1 - @test findfirst(DNA_G, kmer) == 3 - @test findlast(DNA_A, kmer) == 4 - @test findlast(DNA_G, kmer) == 5 - - kmer = AAKmer("AMVKFPSMT") - - @test findnext(AA_A, kmer, 1) == 1 - @test findnext(AA_M, kmer, 1) == 2 - @test findnext(AA_V, kmer, 1) == 3 - @test findnext(AA_K, kmer, 1) == 4 - @test findnext(AA_F, kmer, 1) == 5 - @test findnext(AA_P, kmer, 1) == 6 - @test findnext(AA_S, kmer, 1) == 7 - @test findnext(AA_M, kmer, 1) == 2 - @test findnext(AA_T, kmer, 1) == 9 - - @test findnext(AA_F, kmer, 4) == 5 - @test findprev(AA_F, kmer, 4) == nothing - @test findnext(AA_A, kmer, 7) == nothing - @test findnext(AA_M, kmer, 5) == 8 - - @test findfirst(AA_M, kmer) == 2 - @test findlast(AA_M, kmer) == 8 -end diff --git a/test/iteration.jl b/test/iteration.jl deleted file mode 100644 index 098eab5..0000000 --- a/test/iteration.jl +++ /dev/null @@ -1,236 +0,0 @@ -@testset "EveryKmer" begin - @testset "EveryKmer DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - # Kmer and sequence Alphabets match. - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - - # Kmer and sequence Alphabets mismatch. 
- s3 = dna"AC-TGAG--TGC" - @test collect(EveryKmer{DNACodon}(s3)) == [ - (UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(5), Kmer(DNA_G, DNA_A, DNA_G)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C)), - ] - end - - @testset "EveryKmer RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - # Kmer and sequence Alphabets match. - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - - # Kmer and sequence Alphabets mismatch. - s3 = rna"AC-UGAG--UGC" - @test collect(EveryKmer{RNACodon}(s3)) == [ - (UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(5), Kmer(RNA_G, RNA_A, RNA_G)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C)), - ] - end - - @testset "EveryKmer AA" begin - s = randaaseq(500) - s2 = LongAA(s) - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - end -end - -@testset "SpacedKmers" begin - @testset "SpacedKmers DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == - collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == - length(SpacedKmers(s2, Val{31}(), 50)) == - 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == - collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == - length(SpacedKmers(s2, Val{201}(), 50)) == - 6 - - s3 = dna"AC-TGAG--TGC" - @test collect(SpacedKmers{DNACodon}(s3, 3)) == [ - (UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C)), - ] - end - - @testset 
"SpacedKmers RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == - collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == - length(SpacedKmers(s2, Val{31}(), 50)) == - 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == - collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == - length(SpacedKmers(s2, Val{201}(), 50)) == - 6 - - s3 = rna"AC-UGAG--UGC" - @test collect(SpacedKmers{RNACodon}(s3, 3)) == [ - (UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C)), - ] - end - - @testset "SpacedKmers AA" begin - s = randaaseq(500) - s2 = LongAA(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == - collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == - length(SpacedKmers(s2, Val{31}(), 50)) == - 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == - collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == - length(SpacedKmers(s2, Val{201}(), 50)) == - 6 - end -end - -@testset "EveryCanonicalKmer" begin - @testset "EveryCanonicalKmer DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - - # Iterator generates expected results... - ## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == collect(EveryCanonicalKmer(s2, Val{201}())) - - ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == collect(EveryCanonicalKmer(s, Val{201}())) - - # Test equivalency between different levels of bit compression... 
- @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - - # Kmer and sequence Alphabets mismatch. - s3 = dna"AC-TGAG--TGC" - @test collect(EveryCanonicalKmer{DNACodon}(s3)) == [ - (UInt64(4), canonical(Kmer(DNA_T, DNA_G, DNA_A))), - (UInt64(5), canonical(Kmer(DNA_G, DNA_A, DNA_G))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C))), - ] - end - - @testset "EveryCanonicalKmer RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - - # Iterator generates expected results... - ## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == collect(EveryCanonicalKmer(s2, Val{201}())) - - ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == collect(EveryCanonicalKmer(s, Val{201}())) - - # Test equivalency between different levels of bit compression... 
- @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - - s3 = rna"AC-UGAG--UGC" - @test collect(EveryCanonicalKmer{RNACodon}(s3)) == [ - (UInt64(4), canonical(Kmer(RNA_U, RNA_G, RNA_A))), - (UInt64(5), canonical(Kmer(RNA_G, RNA_A, RNA_G))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C))), - ] - end -end - -@testset "SpacedCanonicalKmers" begin - @testset "SpacedCanonicalKmers DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == - collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == - length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == - 10 - - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, 
Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == - collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == - length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == - 6 - - s3 = dna"AC-TGAG--TGC" - @test collect(SpacedCanonicalKmers{DNACodon}(s3, 3)) == [ - (UInt64(4), canonical(Kmer(DNA_T, DNA_C, DNA_A))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C))), - ] - end - - @testset "SpacedCanonicalKmers RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == - collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == - length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == - 10 - - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == - collect(SpacedCanonicalKmers(s2, 
Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == - length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == - 6 - - s3 = rna"AC-UGAG--UGC" - @test collect(SpacedCanonicalKmers{RNACodon}(s3, 3)) == [ - (UInt64(4), canonical(Kmer(RNA_U, RNA_C, RNA_A))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C))), - ] - end -end diff --git a/test/length.jl b/test/length.jl deleted file mode 100644 index 598980e..0000000 --- a/test/length.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testset "Length" begin - for len in [1, 16, 32, 64, 128] - @test length(DNAKmer(random_dna_kmer(len))) == len - @test length(RNAKmer(random_rna_kmer(len))) == len - @test length(AAKmer(random_aa(len))) == len - end -end diff --git a/test/mismatches.jl b/test/mismatches.jl deleted file mode 100644 index 620797c..0000000 --- a/test/mismatches.jl +++ /dev/null @@ -1,51 +0,0 @@ -@testset "Mismatches" begin - function test_mismatches(a, b) - count = 0 - for (x, y) in zip(a, b) - count += x != y - end - @test mismatches(a, b) === mismatches(b, a) === count - end - - for len in 1:64, _ in 1:10 - a = random_dna_kmer(len) - b = random_dna_kmer(len) - test_mismatches(DNAKmer(a), DNAKmer(b)) - test_mismatches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - - a = random_rna_kmer(len) - b = random_rna_kmer(len) - test_mismatches(RNAKmer(a), RNAKmer(b)) - test_mismatches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - - a = AAKmer(random_aa(len)) - b = AAKmer(random_aa(len)) - test_mismatches(a, b) - end -end - -@testset "Matches" begin - function test_matches(a, b) - count = 0 - for (x, y) in zip(a, b) - count += x == y - end - @test matches(a, b) === matches(b, a) === count - end - - for len in 1:64, _ in 1:10 - a = random_dna_kmer(len) - b = random_dna_kmer(len) - test_matches(DNAKmer(a), DNAKmer(b)) - test_matches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - - a = random_rna_kmer(len) - b = random_rna_kmer(len) - test_matches(RNAKmer(a), RNAKmer(b)) - 
test_matches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - - a = AAKmer(random_aa(len)) - b = AAKmer(random_aa(len)) - test_matches(a, b) - end -end diff --git a/test/order.jl b/test/order.jl deleted file mode 100644 index 64dc243..0000000 --- a/test/order.jl +++ /dev/null @@ -1,15 +0,0 @@ -@testset "Order" begin - @test DNAMer("AA") < DNAMer("AC") < DNAMer("AG") < DNAMer("AT") < DNAMer("CA") - @test RNAMer("AA") < RNAMer("AC") < RNAMer("AG") < RNAMer("AU") < RNAMer("CA") - - @test BigDNAMer("AA") < - BigDNAMer("AC") < - BigDNAMer("AG") < - BigDNAMer("AT") < - BigDNAMer("CA") - @test BigRNAMer("AA") < - BigRNAMer("AC") < - BigRNAMer("AG") < - BigRNAMer("AU") < - BigRNAMer("CA") -end diff --git a/test/print.jl b/test/print.jl deleted file mode 100644 index b96a80c..0000000 --- a/test/print.jl +++ /dev/null @@ -1,37 +0,0 @@ -@testset "Print" begin - buf = IOBuffer() - - print(buf, DNAKmer("ACGT")) - @test String(take!(buf)) == "ACGT" - - print(buf, RNAKmer("ACGU")) - @test String(take!(buf)) == "ACGU" - - print(buf, Kmer{DNAAlphabet{4}}("ACGT")) - @test String(take!(buf)) == "ACGT" - - print(buf, Kmer{RNAAlphabet{4}}("ACGU")) - @test String(take!(buf)) == "ACGU" - - print(buf, AAKmer("AMVKFPSMT")) - @test String(take!(buf)) == "AMVKFPSMT" -end - -@testset "Show" begin - buf = IOBuffer() - - show(buf, DNAKmer("AGAGT")) - @test String(take!(buf)) == "AGAGT" - - show(buf, RNAKmer("AGAGU")) - @test String(take!(buf)) == "AGAGU" - - show(buf, Kmer{DNAAlphabet{4}}("AGAGT")) - @test String(take!(buf)) == "AGAGT" - - show(buf, Kmer{RNAAlphabet{4}}("AGAGU")) - @test String(take!(buf)) == "AGAGU" - - print(buf, AAKmer("AMVKFPSMT")) - @test String(take!(buf)) == "AMVKFPSMT" -end diff --git a/test/random.jl b/test/random.jl deleted file mode 100644 index 48074e7..0000000 --- a/test/random.jl +++ /dev/null @@ -1,26 +0,0 @@ -@testset "Random" begin - @testset for k in 1:64 - for _ in 1:10 - kmer = rand(DNAKmer{k}) - @test isa(kmer, DNAKmer{k}) - kmer = rand(RNAKmer{k}) - 
@test isa(kmer, RNAKmer{k}) - end - for size in [0, 1, 2, 5, 10, 100] - @test length(rand(DNAKmer{k}, size)) == size - @test length(rand(RNAKmer{k}, size)) == size - end - kmers = rand(DNAKmer{k}, 10_000) - for i in 1:k - a = sum([kmer[i] for kmer in kmers] .== DNA_A) - c = sum([kmer[i] for kmer in kmers] .== DNA_C) - g = sum([kmer[i] for kmer in kmers] .== DNA_G) - t = sum([kmer[i] for kmer in kmers] .== DNA_T) - @test 2200 ≤ a ≤ 2800 - @test 2200 ≤ c ≤ 2800 - @test 2200 ≤ g ≤ 2800 - @test 2200 ≤ t ≤ 2800 - @test a + c + g + t == 10_000 - end - end -end diff --git a/test/shuffle.jl b/test/shuffle.jl deleted file mode 100644 index 793081b..0000000 --- a/test/shuffle.jl +++ /dev/null @@ -1,25 +0,0 @@ -@testset "Shuffle" begin - for s in ["A", "C", "G", "T"] - kmer = DNAKmer(s) - @test kmer === shuffle(kmer) - end - - function count(kmer) - a = c = g = t = 0 - for x in kmer - a += x == DNA_A - c += x == DNA_C - g += x == DNA_G - t += x == DNA_T - end - return a, c, g, t - end - - for k in 1:64, _ in 1:10 - kmer = rand(DNAKmer{k}) - @test count(kmer) == count(shuffle(kmer)) - if k ≥ 30 - @test kmer != shuffle(kmer) - end - end -end diff --git a/test/transformations.jl b/test/transformations.jl deleted file mode 100644 index e16c310..0000000 --- a/test/transformations.jl +++ /dev/null @@ -1,60 +0,0 @@ -@testset "Transformations" begin - function test_reverse(T, seq) - revseq = reverse(T(seq)) - @test String(revseq) == reverse(seq) - end - - function test_dna_complement(T, seq) - comp = complement(T(seq)) - @test String(comp) == dna_complement(seq) - end - - function test_rna_complement(T, seq) - comp = complement(T(seq)) - @test String(comp) == rna_complement(seq) - end - - function test_dna_revcomp(T, seq) - revcomp = reverse_complement(T(seq)) - @test String(revcomp) == reverse(dna_complement(seq)) - end - - function test_rna_revcomp(T, seq) - revcomp = reverse_complement(T(seq)) - @test String(revcomp) == reverse(rna_complement(seq)) - end - - @testset "Reverse" 
begin - for len in 1:64, _ in 1:10 - test_reverse(DNAKmer{len}, random_dna_kmer(len)) - test_reverse(RNAKmer{len}, random_rna_kmer(len)) - end - - seq = dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAGATAC" - @test reverse(seq[(length(seq) - 9):length(seq)]) == dna"CATAGAAAAA" - end - - @testset "Complement" begin - for len in 1:64, _ in 1:10 - test_dna_complement(DNAKmer{len}, random_dna_kmer(len)) - test_rna_complement(RNAKmer{len}, random_rna_kmer(len)) - end - end - - @testset "Reverse Complement" begin - for len in 1:64, _ in 1:10 - test_dna_revcomp(DNAKmer{len}, random_dna_kmer(len)) - test_rna_revcomp(RNAKmer{len}, random_rna_kmer(len)) - end - end - - @testset "Canonical" begin - @test canonical(DNAKmer{4, 1}("ACCG")) == DNAKmer{4, 1}("ACCG") - @test canonical(DNAKmer{4, 1}("GCAC")) == DNAKmer{4, 1}("GCAC") - @test canonical(RNAKmer{4, 1}("AAUU")) == RNAKmer{4, 1}("AAUU") - @test canonical(RNAKmer{4, 1}("UGGA")) == RNAKmer{4, 1}("UCCA") - @test canonical(RNAKmer{4, 1}("CGAU")) == RNAKmer{4, 1}("AUCG") - @test canonical(RNAKmer{4, 1}("UGGA")) == RNAKmer{4, 1}("UCCA") - @test canonical(DNAKmer{4, 1}("GCAC")) == DNAKmer{4, 1}("GCAC") - end -end From 9c79f6ac1eb500cd1a21b1baf31d92f57b048013 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 20:10:28 +0100 Subject: [PATCH 29/33] Misc changes --- Project.toml | 8 +++++++ docs/Project.toml | 2 ++ docs/make.jl | 1 + docs/src/composition.md | 0 docs/src/minhash.md | 38 +++++++++++++++++++++++++++++++ ext/StringViewsExt.jl | 11 +++++++++ src/Kmers.jl | 5 +++- src/construction.jl | 14 +++++++++++- src/iterators/CanonicalKmers.jl | 4 ++-- src/iterators/FwKmers.jl | 2 +- src/iterators/SpacedKmers.jl | 2 +- src/iterators/UnambiguousKmers.jl | 32 ++++++++++++++++++-------- src/iterators/common.jl | 9 +++----- test/runtests.jl | 17 ++++---------- 14 files changed, 110 insertions(+), 35 deletions(-) create mode 100644 docs/src/composition.md create mode 100644 docs/src/minhash.md create mode 100644 
ext/StringViewsExt.jl diff --git a/Project.toml b/Project.toml index 747fb56..4897116 100644 --- a/Project.toml +++ b/Project.toml @@ -6,14 +6,22 @@ authors = [ ] version = "1.0.0" +[weakdeps] +StringViews = "354b36f9-a18e-4713-926e-db85100087ba" + [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" +StringViews = "354b36f9-a18e-4713-926e-db85100087ba" + +[extensions] +StringViewsExt = "StringViews" [compat] BioSequences = "3.1.3" Random = "1.10" julia = "1.8" +StringViews = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/docs/Project.toml b/docs/Project.toml index 0c33fe1..2d21498 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,7 +1,9 @@ [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" Kmers = "445028e4-d31f-4f27-89ad-17affd83fc22" +MinHash = "4b3c9753-2685-44e9-8a29-365b96c023ed" [compat] Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index 75ec010..e0b4f59 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,7 @@ makedocs(; "Translation" => "translation.md", "Hashing" => "hashing.md", "FAQ" => "faq.md", + "Cookbook" => ["MinHash" => "minhash.md", "Kmer composition" => "composition.md"], ], authors="Jakob Nybo Nissen, Sabrina J. 
Ward, The BioJulia Organisation and other contributors.", checkdocs=:exports, diff --git a/docs/src/composition.md b/docs/src/composition.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/src/minhash.md b/docs/src/minhash.md new file mode 100644 index 0000000..8b9d127 --- /dev/null +++ b/docs/src/minhash.md @@ -0,0 +1,38 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers + using FASTX + using MinHash +end +``` +## MinHash +The MinHash algorithm is used in tools such as [Mash](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) and [sourmash](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6720031/) to quickly compute approximate similarities of genomes, collections of genomes, or collections of reads. + +```jldoctest; filter = r"^\d\d\d? MB/s$" => s"***" +using BioSequences, MinHash, FASTX, Kmers + +# Write 100 MB of DNA in 50 genomes to buffer +buffer = IOBuffer() +writer = FASTAWriter(buffer) +n_bytes = sum(1:50) do genome + rec = FASTARecord("seq_$(genome)", randdnaseq(2_000_000)) + write(writer, rec) +end +flush(writer) + +# Time minhashing the 50 genomes +timing = @timed FASTAReader(seekstart(buffer); copy=false) do reader + map(reader) do record + seq = codeunits(sequence(record)) + minhash(fx_hash, CanonicalDNAMers{16}(sequence(record)), 1000) + end +end +println(round(Int, n_bytes / (timing.time * 1e6)), " MB/s") + +# output + +200 MB/s +``` diff --git a/ext/StringViewsExt.jl b/ext/StringViewsExt.jl new file mode 100644 index 0000000..f674427 --- /dev/null +++ b/ext/StringViewsExt.jl @@ -0,0 +1,11 @@ +module StringViewsExt + +using StringViews: StringView +using Kmers: Kmers + +# This extension is important because FASTX uses string views. +# The documentation of StringViews promises that the underlying +# string is UTF-8 encoded. 
+Kmers.is_ascii(::Type{<:StringView}) = true + +end # module diff --git a/src/Kmers.jl b/src/Kmers.jl index 7d5e45e..e098a69 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -155,7 +155,6 @@ const unsafe = Unsafe() const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} -const Bytes = Union{String, SubString{String}, AbstractVector{UInt8}} const BitInteger = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128} @@ -172,4 +171,8 @@ include("iterators/CanonicalKmers.jl") include("iterators/UnambiguousKmers.jl") include("iterators/SpacedKmers.jl") +if !isdefined(Base, :get_extension) + include("../ext/StringViewsExt.jl") +end + end # module diff --git a/src/construction.jl b/src/construction.jl index ce4a8fb..fdf0475 100644 --- a/src/construction.jl +++ b/src/construction.jl @@ -22,6 +22,18 @@ struct AsciiEncode <: RecodingScheme end conversion to the right encoding" struct GenericRecoding <: RecodingScheme end +""" + is_ascii(::Type{T})::Bool + +Trait function. Should return `true` for `AbstractVector{UInt8}`, or for +string types for which `codeunits(s)` returns an `AbstractVector{UInt8}`, where +every ASCII byte in the string is preserved in the vector. +This is true for all UTF8, latin1 and ASCII encoded string types. 
+""" +is_ascii(::Type) = false +is_ascii(::Type{<:Union{String, SubString{String}}}) = true +is_ascii(::Type{<:AbstractVector{UInt8}}) = true + function RecodingScheme(A::Alphabet, source_type::Type)::RecodingScheme return if source_type <: BioSequence if BioSequences.encoded_data_eltype(source_type) <: BitInteger @@ -42,7 +54,7 @@ function RecodingScheme(A::Alphabet, source_type::Type)::RecodingScheme else GenericRecoding() end - elseif source_type <: Bytes && BioSequences.codetype(A) isa BioSequences.AsciiAlphabet + elseif is_ascii(source_type) && BioSequences.codetype(A) isa BioSequences.AsciiAlphabet AsciiEncode() else GenericRecoding() diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl index 6ece49d..87a8c8e 100644 --- a/src/iterators/CanonicalKmers.jl +++ b/src/iterators/CanonicalKmers.jl @@ -60,7 +60,7 @@ end @inline function iterate_kmer( R::AsciiEncode, it::FwRvIterator{A, K, S}, -) where {A <: NucleicAcidAlphabet, K, S <: Bytes} +) where {A <: NucleicAcidAlphabet, K, S} src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) length(src) < K && return nothing @@ -169,7 +169,7 @@ end end """ - CanonicalKmers{A <: NucleicAcidAlphabet, K, S} + CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} Iterator of canonical nucleic acid kmers. 
The result of this iterator is equivalent to calling `canonical` on each value of a `FwKmers` iterator, but may be more diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl index 4ed24d2..f199a14 100644 --- a/src/iterators/FwKmers.jl +++ b/src/iterators/FwKmers.jl @@ -63,7 +63,7 @@ end @inline function iterate_kmer( R::AsciiEncode, it::FwKmers{A, K, S}, -) where {A <: Alphabet, K, S <: Bytes} +) where {A <: Alphabet, K, S} src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) length(src) < K && return nothing diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl index 792dfa6..13504ad 100644 --- a/src/iterators/SpacedKmers.jl +++ b/src/iterators/SpacedKmers.jl @@ -102,7 +102,7 @@ end @inline function iterate_kmer( R::AsciiEncode, it::SpacedKmers{A, K, J, S}, -) where {A <: Alphabet, K, J, S <: Bytes} +) where {A <: Alphabet, K, J, S} src = used_source(RecodingScheme(A(), S), it.seq) Base.require_one_based_indexing(src) length(src) < K && return nothing diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl index 701678d..7c74f82 100644 --- a/src/iterators/UnambiguousKmers.jl +++ b/src/iterators/UnambiguousKmers.jl @@ -1,5 +1,5 @@ """ - UnambiguousKmers{A <: TwoBit, K, S} <: AbstractKmerIterator{A, K} + UnambiguousKmers{A <: TwoBit, K, S} Iterator of 2-bit nucleic acid kmers. 
This differs from `FwKmers` in that any kmers containing ambiguous nucleotides are skipped, whereas using `FwKmers`, they result @@ -23,11 +23,15 @@ julia> collect(it) CAUC ``` """ -struct UnambiguousKmers{A <: TwoBit, K, S} <: AbstractKmerIterator{A, K} +struct UnambiguousKmers{A <: TwoBit, K, S} it::FwKmers{A, K, S} end Base.IteratorSize(::Type{<:UnambiguousKmers}) = Base.SizeUnknown() +function Base.eltype(::Type{<:UnambiguousKmers{A, K}}) where {A, K} + Tuple{derive_type(Kmer{A, K}), Int} +end + source_type(::Type{UnambiguousKmers{A, K, S}}) where {A, K, S} = S # Constructors @@ -45,7 +49,8 @@ const UnambiguousDNAMers{K, S} = UnambiguousKmers{DNAAlphabet{2}, K, S} const UnambiguousRNAMers{K, S} = UnambiguousKmers{RNAAlphabet{2}, K, S} @inline function Base.iterate(it::UnambiguousKmers{A, K, S}) where {A, K, S} - state = (eltype(it)(unsafe, zero_tuple(eltype(it))), K, 1) + T = derive_type(Kmer{A, K}) + state = (T(unsafe, zero_tuple(T)), K, 1) iterate_kmer(RecodingScheme(A(), S), it, state) end @@ -59,18 +64,19 @@ end state::Tuple{Kmer, Int, Int}, ) (kmer, remaining, index) = state + K = ksize(kmer) while !iszero(remaining) index > lastindex(it.it.seq) && return nothing - symbol = convert(eltype(kmer), it.it.seq[index]) + symbol = convert(typeof(kmer), it.it.seq[index]) index += 1 if isambiguous(symbol) - remaining = ksize(eltype(it)) + remaining = K else remaining -= 1 kmer = shift(kmer, symbol) end end - (kmer, (kmer, 1, index)) + ((kmer, index - K), (kmer, 1, index)) end # Here, we can forward directly to FwKmers @@ -80,7 +86,13 @@ end state::Tuple{Kmer, Int, Int}, ) (kmer, _, index) = state - iterate(it.it, (kmer, index)) + itval = iterate(it.it, (kmer, index)) + if itval === nothing + nothing + else + (kmer, s) = itval + ((kmer, index - ksize(kmer)), s) + end end @inline function iterate_kmer( @@ -97,7 +109,7 @@ end index += 1 encoding = @inbounds ASCII_SKIPPING_LUT[(byte + 0x01) % Int] if encoding == 0xff - 
throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + throw(BioSequences.EncodeError(Alphabet(kmer), repr(byte))) elseif encoding == 0xf0 remaining = K else @@ -105,7 +117,7 @@ end kmer = shift_encoding(kmer, encoding % UInt) end end - (kmer, (kmer, 1, index)) + ((kmer, index - K), (kmer, 1, index)) end @inline function iterate_kmer( @@ -121,5 +133,5 @@ end index += 1 remaining = isone(count_ones(encoding)) ? remaining - 1 : K end - (kmer, (kmer, 1, index)) + ((kmer, index - K), (kmer, 1, index)) end diff --git a/src/iterators/common.jl b/src/iterators/common.jl index f975318..cb9e992 100644 --- a/src/iterators/common.jl +++ b/src/iterators/common.jl @@ -26,13 +26,10 @@ Get the type of the data source that kmers are extracted from """ function source_type end -function used_source(R::RecodingScheme, s) - if R isa AsciiEncode && s isa Union{String, SubString{String}} - codeunits(s) - else - s - end +function used_source(::AsciiEncode, s::AbstractString) + is_ascii(typeof(s)) ? 
codeunits(s) : s end +used_source(::RecodingScheme, s) = s @noinline throw_bad_byte_error(b::UInt8) = error("Cannot interpret byte $(repr(b)) as nucleotide") diff --git a/test/runtests.jl b/test/runtests.jl index 07c0bd4..c55b703 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -568,18 +568,9 @@ end end end -# include("construction_and_conversion.jl") -# include("comparisons.jl") -# include("length.jl") -# include("access.jl") -# include("random.jl") -# include("find.jl") -# include("print.jl") -# include("transformations.jl") -# include("mismatches.jl") -# include("debruijn_neighbors.jl") -# include("iteration.jl") -# include("translation.jl") -#include("shuffle.jl") +# FwRvIterator +# Canonical +# UnambiguousKmers +# SpacedKmers end # module From 825bc73d8ff95deaa06998fbd3820910b9a9d38e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 20:17:03 +0100 Subject: [PATCH 30/33] Typos and fixes --- docs/src/faq.md | 2 +- docs/src/hashing.md | 2 +- docs/src/index.md | 4 ++-- docs/src/iteration.md | 4 ++-- docs/src/kmers.md | 6 +++--- docs/src/translation.md | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/src/faq.md b/docs/src/faq.md index fd2590e..56684ec 100644 --- a/docs/src/faq.md +++ b/docs/src/faq.md @@ -20,7 +20,7 @@ In fact, this is implemented by a manually thrown `MethodError`; the generic cas The reason for this is the consequence of the following limitations: * `isequal(x, y)` implies `hash(x) == hash(y)` -* `isqual(x, y)` and `x == y` ought to be identical for well-defined elements (i.e. in the absence of `missing`s and `NaN`s etc.) +* `isequal(x, y)` and `x == y` ought to be identical for well-defined elements (i.e. in the absence of `missing`s and `NaN`s etc.) 
* `hash(::Kmer)` must be absolutely maximally efficient If kmers were to be comparable to `BioSequence`, then the hashing of `BioSequence` should follow `Kmer`, which practically speaking would mean that all biosequences would need to be recoded to `Kmer`s before hashing. diff --git a/docs/src/hashing.md b/docs/src/hashing.md index 9f71980..d6ebfe6 100644 --- a/docs/src/hashing.md +++ b/docs/src/hashing.md @@ -13,7 +13,7 @@ end or Kmers.jl ## Hashing -Kmers implement `Base.hash`, yielding a `UInt` value: +Kmers.jl implements `Base.hash`, yielding a `UInt` value: ```jldoctest; filter = r"^0x[0-9a-fA-F]+$" julia> hash(mer"UGCUGUAC"r) diff --git a/docs/src/index.md b/docs/src/index.md index 455f155..58eb539 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,5 +1,5 @@ ## Kmers.jl -Kmers.jl provide the `Kmer <: BioSequence` type which implement the concept of a +Kmers.jl provides the `Kmer <: BioSequence` type which implements the concept of a [k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`. @@ -7,7 +7,7 @@ K-mers are used frequently in bioinformatics because, when k is small and known compile time, these sequences can be efficiently represented as integers and stored directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. -In Kmers.jl, the `Kmer` type is psrameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. +In Kmers.jl, the `Kmer` type is parameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. Conceptually, one may use the following analogy: * `BioSequence` is like `AbstractVector` diff --git a/docs/src/iteration.md b/docs/src/iteration.md index 543d017..0dcd53f 100644 --- a/docs/src/iteration.md +++ b/docs/src/iteration.md @@ -8,10 +8,10 @@ end ``` ## Iteration Most applications of kmers extract multiple kmers from an underlying sequence. 
-To facilitate this, Kmers.jl implements a few various basic kmer iterators which are all subtypes of `AbstractKmerIterator`. +To facilitate this, Kmers.jl implements a few basic kmer iterators which are all subtypes of `AbstractKmerIterator`. The underlying sequence can be a `BioSequence`, `AbstractString`, or `AbstractVector{UInt8}`. -In the latter case, if the alphabet of the element type implements `BioSequences.AsciiAlphabet`, the vector will be treated a vector of ASCII characters. +In the latter case, if the alphabet of the element type implements `BioSequences.AsciiAlphabet`, the vector will be treated as a vector of ASCII characters. Similarly to the rules when constructing kmers directly, DNA and RNA is treated interchangeably when the underlying sequence is a `BioSequence`, but when the underlying sequence is a string or bytevector, `U` and `T` are considered different, and e.g. uracil cannot be constructed from a sequence containing `T`: diff --git a/docs/src/kmers.md b/docs/src/kmers.md index f78d2f3..833de75 100644 --- a/docs/src/kmers.md +++ b/docs/src/kmers.md @@ -26,8 +26,8 @@ struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} end ``` Where: -* `A` is the `Alphabet` as defined in BioSequences.jl -* `K` is the length +* `A` is the `Alphabet` as defined in BioSequences.jl. +* `K` is the length. * `N` is an extra type parameter derived from the first two, which exists only because Julia does not allow computed type parameters. @@ -154,7 +154,7 @@ KD ### A note on type stability !!! warning Except scalar indexing which always returns a single symbol, all the operations - above a _type unstable_, since the length (and thus type) of the resulting + above are _type unstable_, since the length (and thus type) of the resulting kmer depends on the input value, not its type. 
However, type unstable functions may be type-stable, if the indexing value is diff --git a/docs/src/translation.md b/docs/src/translation.md index d9cefd9..9fe222b 100644 --- a/docs/src/translation.md +++ b/docs/src/translation.md @@ -17,7 +17,7 @@ CLI ``` Since `Kmer`s are immutable, the in-place `translate!` function is not implemented for `Kmers`. -Also, remember that `Kmer`s are only efficient when short (at most a few hundred symbols). Hence, entire exons or genes should probably be represented by `LongSequence` or `LongSubSeq`. +Also, remember that `Kmer`s are only efficient when short (at most a few hundred symbols). Hence, entire exons or genes should probably not ever be represented by a `Kmer`, but rather as a `LongSequence` or `LongSubSeq` from BioSequences.jl. ### Reverse translation Kmers.jl implements reverse translation, in which an amino acid sequence is translated to an RNA sequence. From 8801e32751b4e2459662e0dd3eb3786d6400f2c1 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 20:42:53 +0100 Subject: [PATCH 31/33] Make minhash example smaller --- docs/src/minhash.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/src/minhash.md b/docs/src/minhash.md index 8b9d127..89bc605 100644 --- a/docs/src/minhash.md +++ b/docs/src/minhash.md @@ -11,14 +11,15 @@ end ## MinHash The MinHash algorithm is used in tools such as [Mash](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) and [sourmash](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6720031/) to quickly compute approximate similarities of genomes, collections of genomes, or collections of reads. -```jldoctest; filter = r"^\d\d\d? MB/s$" => s"***" +```jldoctest; filter = r"^\d+ MB/s$" => s"***" using BioSequences, MinHash, FASTX, Kmers -# Write 100 MB of DNA in 50 genomes to buffer +# Write 25 sequences of length 20 to a buffer. +# Try changing this to length 4 million! 
buffer = IOBuffer() writer = FASTAWriter(buffer) -n_bytes = sum(1:50) do genome - rec = FASTARecord("seq_$(genome)", randdnaseq(2_000_000)) +n_bytes = sum(1:25) do genome + rec = FASTARecord("seq_$(genome)", randdnaseq(20)) write(writer, rec) end flush(writer) From 2e7553fdfad2a2f0381dc81ba3083f47bc07750e Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 21:02:42 +0100 Subject: [PATCH 32/33] Optimise slicing --- src/indexing.jl | 25 ++++++++++++++++--------- src/iterators/UnambiguousKmers.jl | 10 +++++----- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index 99ca6ac..e2de96d 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -10,16 +10,23 @@ end # This is usually type unstable, but in user code, users may use constant-folded ranges, # e.g. f(x) = x[2:4]. In this case, we need it to compile to very efficient code. # Hence, it MUST use @inline -@inline function Base.getindex(kmer::Kmer{A}, range::AbstractRange{<:Integer}) where {A} +@inline function Base.getindex(kmer::Kmer{A}, range::AbstractUnitRange{<:Integer}) where {A} @boundscheck checkbounds(kmer, range) - T = derive_type(Kmer{A, length(range)}) - data = zero_tuple(T) - nbits = BioSequences.bits_per_symbol(A()) - for i in range - (_, data) = - leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + K = length(range) + iszero(K) && return Kmer{A, 0, 0}(unsafe, ()) + (i1, _) = BioSequences.bitindex(kmer, first(range)) + (i2, o2) = BioSequences.bitindex(kmer, last(range)) + data = kmer.data[i1:i2] + (_, data) = rightshift_carry(data, o2, zero(UInt)) + T = derive_type(Kmer{A, K}) + N = nsize(T) + # After the shift, the first coding element may be unused + new_data = if N > length(data) + tail(data) + else + data end - T(unsafe, data) + T(unsafe, new_data) end # Same as above: This needs to be able to inline if the indices are known statically @@ -52,7 +59,7 @@ function Base.getindex(kmer::Kmer{A}, 
indices::AbstractVector{<:Integer}) where T(unsafe, data) end -@inline function BioSequences.bitindex(kmer::Kmer, i::Unsigned)::Tuple{UInt, UInt} +@inline function BioSequences.bitindex(kmer::Kmer, i::Integer)::Tuple{UInt, UInt} bps = BioSequences.bits_per_symbol(kmer) % UInt bpe = (8 * sizeof(UInt)) % UInt (i, o) = divrem((UInt(i) - UInt(1) + n_unused(typeof(kmer))) * bps, bpe) diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl index 7c74f82..4ee67aa 100644 --- a/src/iterators/UnambiguousKmers.jl +++ b/src/iterators/UnambiguousKmers.jl @@ -13,14 +13,14 @@ and `UnambiguousRNAMers{K}(s)`. by `UnambiguousKmers`. # Examples: -``` +```jldoctest julia> it = UnambiguousRNAMers{4}(dna"TGAGCWKCATC"); julia> collect(it) -3-element Vector{Kmer{RNAAlphabet{2}, 4, 1}}: - UGAG - GAGC - CAUC +3-element Vector{Tuple{Kmer{RNAAlphabet{2}, 4, 1}, Int64}}: + (UGAG, 1) + (GAGC, 2) + (CAUC, 8) ``` """ struct UnambiguousKmers{A <: TwoBit, K, S} From 2dd7acb4a080f9a66ae51dbd2cf015a3d2793369 Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Tue, 2 Jan 2024 21:06:09 +0100 Subject: [PATCH 33/33] Fix bug in slicing --- src/indexing.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/indexing.jl b/src/indexing.jl index e2de96d..9a406e6 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -21,12 +21,12 @@ end T = derive_type(Kmer{A, K}) N = nsize(T) # After the shift, the first coding element may be unused - new_data = if N > length(data) + new_data = if N < length(data) tail(data) else data end - T(unsafe, new_data) + T(unsafe, (first(new_data) & get_mask(T), tail(new_data)...)) end # Same as above: This needs to be able to inline if the indices are known statically