diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..903da37 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,8 @@ +always_for_in = true +whitespace_typedefs = true +whitespace_ops_in_indices = true +remove_extra_newlines = true +import_to_using = true +normalize_line_endings = "unix" +separate_kwargs_with_semicolon = true +whitespace_in_kwargs = false diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml index 35d4097..c553475 100644 --- a/.github/workflows/Documentation.yml +++ b/.github/workflows/Documentation.yml @@ -21,6 +21,5 @@ jobs: run: julia --color=yes --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - name: Build and deploy env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key run: julia --color=yes --project=docs/ docs/make.jl diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index cff1346..71a40b6 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -42,3 +42,21 @@ jobs: name: codecov-umbrella fail_ci_if_error: false token: ${{ secrets.CODECOV_TOKEN }} + + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@latest + with: + version: '1' + - run: | + julia --project=docs -e ' + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate()' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 02e1207..37956a9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ *.jl.*.cov *.jl.mem .DS_Store -Manifest.toml \ No newline at end of file +Manifest.toml +TODO.md +docs/build diff --git a/Project.toml b/Project.toml index 28dacf5..4897116 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,31 @@ name = "Kmers" uuid = "445028e4-d31f-4f27-89ad-17affd83fc22" -authors = ["Sabrina Jaye Ward "] -version = "0.1.0" +authors = [ + "Jakob Nybo Nissen ", + "Sabrina Jaye Ward " +] +version = "1.0.0" + +[weakdeps] +StringViews = "354b36f9-a18e-4713-926e-db85100087ba" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" +StringViews = "354b36f9-a18e-4713-926e-db85100087ba" + +[extensions] +StringViewsExt = "StringViews" [compat] BioSequences = "3.1.3" -julia = "1.5" +Random = "1.10" +julia = "1.8" +StringViews = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [targets] -test = ["Test"] +test = ["Test", "Random"] diff --git a/README.md b/README.md index ed51eff..2dd7595 100644 --- a/README.md +++ b/README.md @@ -3,57 +3,57 @@ [![Latest Release](https://img.shields.io/github/release/BioJulia/Kmers.jl.svg)](https://github.com/BioJulia/Kmers.jl/releases/latest) [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE) [![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable) -[![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) - ## Description +Kmers.jl provide the `Kmer <: BioSequence` type which implement the concept of a +[k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`. -Kmers provides a specialised concrete `BioSequence` subtype, optimised for -representing short immutable sequences called kmers: contiguous sub-strings of k -nucleotides of some reference sequence. -They are used extensively in bioinformatic analyses as an informational unit. -This concept was popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. +K-mers are used frequently in bioinformatics because, when k is small and known at +compile time, these sequences can be efficiently represented as integers and stored +directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. -Kmers provides the type representing kmers as well as the implementations of -the APIs specified by the -[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package. +In Kmers.jl, the `Kmer` type is psrameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. -## Installation +Conceptually, one may use the following analogy: +* `BioSequence` is like `AbstractVector` +* `LongSequence` is like `Vector` +* `Kmer` is like [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) from `StaticArrays` + +Kmers.jl is tightly coupled to the +[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, +and relies on its internals. +Hence, you should expect strict compat bounds on BioSequences.jl. + +## Usage +### ⚠️ WARNING ⚠️ +`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such as way that the compiler can use constant folding. +Further, as `Kmer`s are immutable and their operations are aggressively inlined and unrolled, +they become inefficent as they get longer. +For example, reverse-complementing a 32-mer takes 26 ns, compared to 102 ns for the equivalent `LongSequence`. However, for 512-mers, the `LongSequence` takes 126 ns, and the `Kmer` 16 μs! + +Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead + +## Installation You can install BioSequences from the julia REPL. Press `]` to enter pkg mode, and enter the following: ```julia -add Kmers +pkg> add Kmers ``` -If you are interested in the cutting edge of the development, please check out +If you are interested in the cutting edge of development, please check out the master branch to try new features before release. - -## Testing - -Kmers is tested against Julia `1.X` on Linux, OS X, and Windows. - -[![Unit tests](https://github.com/BioJulia/Kmers.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/Kmers.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) -[![Documentation](https://github.com/BioJulia/Kmers.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/BioKmers.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) -[![](https://codecov.io/gh/BioJulia/Kmers.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Kmers.jl) - - ## Contributing - We appreciate contributions from users including reporting bugs, fixing issues, improving performance and adding new features. Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. - ## Questions? - If you have a question about contributing or using BioJulia software, come -on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the +on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/Project.toml b/docs/Project.toml index e064fd1..2d21498 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,9 @@ [deps] +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" +Kmers = "445028e4-d31f-4f27-89ad-17affd83fc22" +MinHash = "4b3c9753-2685-44e9-8a29-365b96c023ed" [compat] -Documenter = "0.24" \ No newline at end of file +Documenter = "1" diff --git a/docs/make.jl b/docs/make.jl index 61b4844..e0b4f59 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,29 +1,32 @@ using Documenter, Kmers -makedocs( - format = Documenter.HTML(), - sitename = "Kmers.jl", - pages = [ - "Home" => "index.md", - "Kmer types" => "kmer_types.md", - "Constructing kmers" => "construction.md", - "Indexing & modifying kmers" => "transforms.md", - "Predicates" => "predicates.md", - "Random kmers" => "random.md", - "Iterating over Kmers" => "iteration.md", - "Translation" => "translate.md", - #"Pattern matching and searching" => "sequence_search.md", - #"Iteration" => "iteration.md", - #"Counting" => "counting.md", - #"I/O" => "io.md", - #"Interfaces" => "interfaces.md" +DocMeta.setdocmeta!( + Kmers, + :DocTestSetup, + :(using BioSequences, Kmers, Test); + recursive=true, +) + +makedocs(; + modules=[Kmers], + format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"), + sitename="Kmers.jl", + pages=[ + "Home" => "index.md", + "The Kmer type" => "kmers.md", + "Iteration" => "iteration.md", + "Translation" => "translation.md", + "Hashing" => "hashing.md", + "FAQ" => "faq.md", + "Cookbook" => ["MinHash" => "minhash.md", "Kmer composition" => "composition.md"], ], - authors = "Ben J. Ward, The BioJulia Organisation and other contributors." + authors="Jakob Nybo Nissen, Sabrina J. Ward, The BioJulia Organisation and other contributors.", + checkdocs=:exports, ) -deploydocs( - repo = "github.com/BioJulia/Kmers.jl.git", - push_preview = true, - deps = nothing, - make = nothing +deploydocs(; + repo="github.com/BioJulia/Kmers.jl.git", + push_preview=true, + deps=nothing, + make=nothing, ) diff --git a/docs/src/composition.md b/docs/src/composition.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/src/construction.md b/docs/src/construction.md deleted file mode 100644 index 6b300a9..0000000 --- a/docs/src/construction.md +++ /dev/null @@ -1,12 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Construction & conversion - -```@docs -Kmer{A,K,N}(itr) -``` diff --git a/docs/src/faq.md b/docs/src/faq.md new file mode 100644 index 0000000..56684ec --- /dev/null +++ b/docs/src/faq.md @@ -0,0 +1,39 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` +## FAQ +### Why can kmers not be compared to biosequences? +It may be surprising that kmers cannot be compared to other biosequences: + +```jldoctest +julia> dna"TAG" == mer"TAG"d +ERROR: MethodError +[...] +``` + +In fact, this is implemented by a manually thrown `MethodError`; the generic case `Base.:==(::BioSequence, ::BioSequence)` is defined. + +The reason for this is the consequence of the following limitations: +* `isequal(x, y)` implies `hash(x) == hash(y)` +* `isequal(x, y)` and `x == y` ought to be identical for well-defined elements (i.e. in the absence of `missing`s and `NaN`s etc.) +* `hash(::Kmer)` must be absolutely maximally efficient + +If kmers were to be comparable to `BioSequence`, then the hashing of `BioSequence` should follow `Kmer`, which practically speaking would mean that all biosequences would need to be recoded to `Kmer`s before hashing. + +### Why isn't there an iterator of unambiguous, canonical kmers or spaced, canonical kmers? +Any iterator of nucleotide kmers can be made into a canonical kmer iterator by simply calling `canonical` on its output kers. + +The `CanonicalKmers` iterator is special cased, because with a step size of 1, it is generally faster to build the next kmer by storing both the reverse and forward kmer, then creating the next kmer by prepending/append the next symbol. + +However, with a larger step size, it becomes more efficient to build the forward kmer, then reverse-complement the whole kmer. + +### Why isn't there an iterator of skipmers/minimizers/k-min-mers, etc? +The concept of kmers have turned out to be remarkably flexible and useful in bioinformatics, and have spawned a neverending stream of variations. + +We simply can't implement them all. +However, we hope to make it relatively easy to implement custom kmer iterators for downstream users. \ No newline at end of file diff --git a/docs/src/hashing.md b/docs/src/hashing.md new file mode 100644 index 0000000..d6ebfe6 --- /dev/null +++ b/docs/src/hashing.md @@ -0,0 +1,59 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +!!! warning + The value of hashes are guaranteed to be reproducible for a given version + of Kmers.jl and Julia, but may __change__ in new minor versions of Julia + or Kmers.jl + +## Hashing +Kmers.jl implements `Base.hash`, yielding a `UInt` value: + +```jldoctest; filter = r"^0x[0-9a-fA-F]+$" +julia> hash(mer"UGCUGUAC"r) +0xe5057d38c8907b22 +``` + +The implementation of `Base.hash` for kmers strikes a compromise between providing a high-quality (non-cryptographic) hash, while being reasonably fast. +While hash collisions can easily be found, they are unlikely to occur at random. +When kmers are of the same (or compatible) alphabets, different kmers hash to different values, even when they have the same underlying bitpattern: + +```jldoctest +julia> using BioSequences: encoded_data + +julia> a = mer"TAG"d; b = mer"AAAAAAATAG"d; + +julia> encoded_data(a) === encoded_data(b) +true + +julia> hash(a) == hash(b) +false +``` + +When they are of compatible alphabets, and have the same content, they hash to the same value. +Currently, only DNA and RNA of the alphabets `DNAAlphabet` and `RNAAlphabet` are compatible: + +```jldoctest +julia> a = mer"UUGU"r; b = mer"TTGT"d; + +julia> a == b # equal +true + +julia> a === b # not egal +false + +julia> hash(a) === hash(b) +true +``` + +For some applications, fast hashing is absolutely crucial. For these cases, Kmers.jl provides [`fx_hash`](@ref), which trades off hash quality for speed: + +```@docs +fx_hash +``` \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 11c4fe9..58eb539 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,59 +1,49 @@ -# Kmers +## Kmers.jl +Kmers.jl provides the `Kmer <: BioSequence` type which implement the concept of a +[k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`. -[![Latest Release](https://img.shields.io/github/release/BioJulia/Kmers.jl.svg)](https://github.com/BioJulia/Kmers.jl/releases/latest) -[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE) -[![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable) -[![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active) +K-mers are used frequently in bioinformatics because, when k is small and known at +compile time, these sequences can be efficiently represented as integers and stored +directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences. -## Description +In Kmers.jl, the `Kmer` type is parameterized by its length, and its data is stored in an `NTuple`. This makes `Kmers` bitstypes and highly efficient. -Kmers provides a specialised concrete `BioSequence` subtype, optimised for -representing short immutable sequences called kmers: contiguous sub-strings of k -nucleotides of some reference sequence. +Conceptually, one may use the following analogy: +* `BioSequence` is like `AbstractVector` +* `LongSequence` is like `Vector` +* `Kmer` is like [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) from `StaticArrays` -They are used extensively in bioinformatic analyses as an informational unit. -This concept was popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. +Kmers.jl is tightly coupled to the +[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package, +and relies on its internals. +Hence, you should expect strict compat bounds on BioSequences.jl. -Kmers provides the type representing kmers as well as the implementations of -the APIs specified by the -[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package. +## Usage +### ⚠️ WARNING ⚠️ +`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that change their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such as way that the compiler can use constant folding. -## Installation +Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead +## Installation You can install BioSequences from the julia REPL. Press `]` to enter pkg mode, and enter the following: ```julia -add Kmers +pkg> add Kmers ``` -If you are interested in the cutting edge of the development, please check out +If you are interested in the cutting edge of development, please check out the master branch to try new features before release. - -## Testing - -Kmers is tested against Julia `1.X` on Linux, OS X, and Windows. - -[![Unit tests](https://github.com/BioJulia/Kmers.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/Kmers.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) -[![Documentation](https://github.com/BioJulia/Kmers.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/BioKmers.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) -[![](https://codecov.io/gh/BioJulia/Kmers.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Kmers.jl) - - ## Contributing - We appreciate contributions from users including reporting bugs, fixing issues, improving performance and adding new features. Take a look at the [contributing files](https://github.com/BioJulia/Contributing) detailed contributor and maintainer guidelines, and code of conduct. - ## Questions? - If you have a question about contributing or using BioJulia software, come -on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the +on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/src/iteration.md b/docs/src/iteration.md index 3b36181..0dcd53f 100644 --- a/docs/src/iteration.md +++ b/docs/src/iteration.md @@ -1,31 +1,105 @@ ```@meta CurrentModule = Kmers DocTestSetup = quote + using BioSequences + using Test using Kmers end ``` +## Iteration +Most applications of kmers extract multiple kmers from an underlying sequence. +To facilitate this, Kmers.jl implements a few basic kmer iterators which are all subtypes of `AbstractKmerIterator`. -# Iterating over kmers +The underlying sequence can be a `BioSequence`, `AbstractString`, or `AbstractVector{UInt8}`. +In the latter case, if the alphabet of the element type implements `BioSequences.AsciiAlphabet`, the vector will be treated as a vector of ASCII characters. -When introducing the `Kmer` type we described kmers as contiguous sub-strings of -k nucleotides of some reference sequence. +Similarly to the rules when constructing kmers directly, DNA and RNA is treated interchangeably when the underlying sequence is a `BioSequence`, but when the underlying sequence is a string or bytevector, `U` and `T` are considered different, and e.g. uracil cannot be constructed from a sequence containing `T`: -This package therefore contains functionality for iterating over all the valid -`Kmers{A,K,N}` in a longer `BioSequence`. +```jldoctest +julia> only(FwDNAMers{3}(rna"UGU")) +DNA 3-mer: +TGT + +julia> only(FwDNAMers{3}("UGU")) +ERROR: +[...] +``` + +The following kmer iterators are implemented: + +### `FwKmers` +The most basic kmer iterator is `FwKmers`, which simply iterates every kmer, in order: + +```@docs +FwKmers +FwDNAMers +FwRNAMers +FwAAMers +``` + +### `FwRvIterator` +This iterates over a nucleic acid sequence. For every kmer it encounters, it outputs the kmer and its reverse complement. + +```@docs +FwRvIterator +``` + +### `CanonicalKmers` +This iterator is similar to [`FwKmers`](@ref), however, for each `Kmer` encountered, it returns the _canonical_ kmer. + +The canonical kmer is defined as the lexographically smaller of a kmer and its reverse complement. +That is, if [`FwKmers`](@ref) would iterate `TCAC`, then [`CanonicalKmers`](@ref) would return `GTGA`, as this is the reverse complement of `TCAC`, and is before `TCAC` in the alphabet. + +[`CanonicalKmers`](@ref) is useful for summarizing the kmer composition of sequences whose strandedness is unknown. + +```@docs +CanonicalKmers +CanonicalDNAMers +CanonicalRNAMers +``` + +### `UnambiguousKmers` +[`UnambiguousKmers`](@ref) iterates unambiguous nucleotides (that is, kmers of the alphabets `DNAAlphabet{2}` or `RNAAlphabet{2}`). +Any kmers containing [ambiguous nucleotides](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC341218/) such as `W` or `N` are skipped. + +```@docs +UnambiguousKmers +UnambiguousDNAMers +UnambiguousRNAMers +``` + +### `SpacedKmers` +The [`SpacedKmers`](@ref) iterator iterates kmers with a fixed step size between k-mers. +For example, for a K of 4, and a step size of 3, the output kmers would overlap with a single nucleotide, like so: + +``` +seq: TGATGCGTAGTG + TGCT + TGCG + GTAG +``` + +Hence, if `FwKmers` are analogous to `UnitRange`, `SpacedKmers` is analogous to `StepRange`. ```@docs -EveryKmer -EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} SpacedKmers -SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} -SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -EveryCanonicalKmer -EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} -SpacedCanonicalKmers -SpacedCanonicalKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} -SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} +SpacedDNAMers +SpacedRNAMers +SpacedAAMers +``` + +The convenience functions [`each_dna_codon`](@ref) and [`each_rna_codon`](@ref) return `SpacedKmers` with a K value of 3 and step size of 3: + +```@docs +each_dna_codon +each_rna_codon ``` +## The `AbstractKmerIterator` interface +It's very likely that users of Kmers.jl need to implement their own custom kmer iterators, in which case they should subtype [`AbstractKmerIterator`](@ref). + +```@docs +AbstractKmerIterator +``` +There is no real interface implemented for this abstract type, other than that `AbstractKmerIterator{A, K}` needs to iterate `Kmer{A, K}`. diff --git a/docs/src/kmer_types.md b/docs/src/kmer_types.md deleted file mode 100644 index aab9330..0000000 --- a/docs/src/kmer_types.md +++ /dev/null @@ -1,92 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Kmer types - -Bioinformatic analyses make extensive use of kmers. -Kmers are contiguous sub-strings of k nucleotides of some reference sequence. - -They are used extensively in bioinformatic analyses as an informational unit. -This concept popularised by short read assemblers. -Analyses within the kmer space benefit from a simple formulation of the sampling -problem and direct in-hash comparisons. - -BioSequences provides the following types to represent Kmers. - -```@docs -Kmer -``` - -The following aliases are also defined: - -```@docs -DNAKmer -DNA27mer -DNA31mer -DNA63mer -RNAKmer -RNA27mer -RNA31mer -RNA63mer -AAKmer -DNACodon -RNACodon -``` - - -### Skipmers - -For some analyses, the contiguous nature of kmers imposes limitations. -A single base difference, due to real biological variation or a sequencing error, -affects all k-mers crossing that position thus impeding direct analyses by identity. -Also, given the strong interdependence of local sequence, contiguous sections -capture less information about genome structure, and so they are more affected by -sequence repetition. - -Skipmers are a generalisation of the concept of a kmer. -They are created using a cyclic pattern of used-and-skipped positions which -achieves increased entropy and tolerance to nucleotide substitution differences -by following some simple rules. - -Skipmers preserve many of the elegant properties of kmers such as reverse -complementability and existence of a canonical representation. -Also, using cycles of three greatly increases the power of direct intersection -between the genomes of different organisms by grouping together the more conserved -nucleotides of protein-coding regions. - -BioSequences currently does not provide a separate type for skipmers, they are -represented using `Mer` and `BigMer` as their representation as a short immutable -sequence encoded in an unsigned integer is the same. -The distinction lies in how they are generated. - -#### Skipmer generation - -A skipmer is a simple cyclic q-gram that includes _m_ out of every _n_ bases -until a total of _k_ bases is reached. - -This is illustrated in the figure below (from -[this paper](https://www.biorxiv.org/content/biorxiv/early/2017/08/23/179960.full.pdf).): - -![skipmer-fig](skipmers.png) - -To maintain cyclic properties and the existence of the reverse-complement as a -skipmer defined by the same function, _k_ should be a multiple of _m_. - -This also enables the existence of a canonical representation for each skipmer, -defined as the lexicographically smaller of the forward and reverse-complement -representations. - -Defining _m_, _n_ and _k_ fixes a value for _S_, the total span of the skipmer, -given by: - -```math -S = n * (\frac{k}{m} - 1) + m -``` - -To see how to iterate over skipmers cf. kmers, see the Iteration section -of the manual. - diff --git a/docs/src/kmers.md b/docs/src/kmers.md new file mode 100644 index 0000000..833de75 --- /dev/null +++ b/docs/src/kmers.md @@ -0,0 +1,187 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +## The `Kmer` type +The central type of Kmers.jl is the `Kmer`. +A `Kmer` is an immutable, bitstype `BioSequence`, with a length known at compile +time. Compared to `LongSequence` in BioSequences.jl, +this gives to one advantage, and comes with two disadvantages: +* Kmers are much faster than `LongSequence`, as they can be stored in registers. +* As kmers gets longer, the code gets increasingly inefficient, as the unrolling + and inlining of the immutable operations breaks down. +* Since their length is part of their type, any operation that results in a kmer + whose length cannot be determined at compile time will be type unstable. + This includes slicing a kmer, pushing and popping it, and other operations. + +The `Kmer` type is (roughly) defined as +```julia +struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} + x::NTuple{N, UInt} +end +``` +Where: +* `A` is the `Alphabet` as defined in BioSequences.jl. +* `K` is the length. +* `N` is an extra type parameter derived from the first two, + which exists only because Julia does not allow computed type parameters. + +### Construction +Kmers can be constructed from a `BioSequence` or `AbstractString` by explicitly +specifying the length of the sequence: + +```jldoctest +julia> Kmer{DNAAlphabet{2}, 5, 1}("TAGCT") +DNA 5-mer: +TAGCT +``` + +The final type parameter can be elided, in which case it will be inferred: + +```jldoctest +julia> Kmer{DNAAlphabet{2}, 5}("TAGCT") +DNA 5-mer: +TAGCT +``` + +Kmers with alphabets `DNAAlphabet{2}`, `RNAAlphabet{2}` and `AminoAcidAlphabet` +can be created with the type aliases `DNAKmer`, `RNAKmer` and `AAKmer`: + +```jldoctest +julia> DNAKmer{3}("tag") +DNA 3-mer: +TAG + +julia> AAKmer{5}("PWYSK") +AminoAcid 5-mer: +PWYSK +``` + +For kmers with an `Alphabet` that implement `BioSequences.AsciiAlphabet`, they can also be constructed from `AbstractVector{UInt8}`, in which case the vector is interpreted as being bytes of ASCII text: + +```jldoctest +julia> AAKmer{3}([0x65, 0x67, 0x7a]) +AminoAcid 3-mer: +EGZ +``` + +When constructing from an `AbstractString` (or byte vector), uracil (`U`) and thymine `T` are treated differently - a `U` cannot be read as thymine: + +```jldoctest +julia> DNAKmer{3}("UAG") +ERROR: cannot encode 0x55 (Char 'U') in DNAAlphabet{2} +[...] +``` + +However, when constructing from a `BioSequence`, these nucleotides are considered +interchangeable: + +```jldoctest +julia> RNAKmer{4}(dna"TATC") +RNA 4-mer: +UAUC +``` + +Finally, kmers can be constructed with a string literal `@mer_str`, where the string must be appended with `d` for DNA, `r` for RNA, or `a` for amino acid: + +```jldoctest +julia> mer"UGCUGA"r +RNA 6-mer: +UGCUGA + +julia> mer"EDEHL"a +AminoAcid 5-mer: +EDEHL +``` + +Since the literals produce the kmer at parse time and inserts it directly into the parsed code, this will always be type stable, +and the overhead related to parsing the string will not be paid: + +```jldoctest; filter = [r"^\s*0\.\d+ seconds.+"s, r"^\d+"s] +julia> function count_aaas(dna) + x = 0 + for kmer in FwDNAMers{3}(dna) + # The parsing happens once here, when the + # code is parsed, and is fine to have in the loop + x += kmer == mer"AAA"d + end + x + end; + +julia> seq = randseq(DNAAlphabet{2}(), 100_000_000); + +julia> @time count_aaas(seq) + 0.193463 seconds (32.05 k allocations: 2.051 MiB, 21.88% compilation time) +1563330 +``` + + +### Indexing +Kmers support most normal indexing, such as scalar indexing: + +```jldoctest +julia> mer"CAGCU"r[3] +RNA_G +``` + +Slicing + +```jldoctest +julia> mer"AGGCTA"d[2:5] +DNA 4-mer: +GGCT +``` + +And indexing with boolean vectors, and vectors of indices: + +```jldoctest +julia> m = mer"MDGKRY"a; + +julia> m[[true, false, true, true, false, true]] +AminoAcid 4-mer: +MGKY + +julia> m[[4,2]] +AminoAcid 2-mer: +KD +``` + +### A note on type stability +!!! warning + Except scalar indexing which always returns a single symbol, all the operations + above are _type unstable_, since the length (and thus type) of the resulting + kmer depends on the input value, not its type. + +However, type unstable functions may be type-stable, if the indexing value is +known at compile time, and the Julia compiler uses constant folding: + +```jldoctest +julia> f(x) = x[2:5]; # 2:5 is a compile time constant + +julia> Test.@inferred f(mer"UCGUAGC"r) +RNA 4-mer: +CGUA +``` + +### Reference +```@docs +Kmer +Mer +@mer_str +DNAKmer +RNAKmer +AAKmer +DNACodon +RNACodon +pop +pop_first +push +push_first +shift +shift_first +``` \ No newline at end of file diff --git a/docs/src/minhash.md b/docs/src/minhash.md new file mode 100644 index 0000000..89bc605 --- /dev/null +++ b/docs/src/minhash.md @@ -0,0 +1,39 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers + using FASTX + using MinHash +end +``` +## MinHash +The MinHash algorithm is used in tools such as [Mash](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x) and [sourmash](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6720031/) to quickly compute approximate similarities of genomes, collections of genomes, or collections of reads. + +```jldoctest; filter = r"^\d+ MB/s$" => s"***" +using BioSequences, MinHash, FASTX, Kmers + +# Write 25 sequences of length 20 to a buffer. +# Try changing this to length 4 million! +buffer = IOBuffer() +writer = FASTAWriter(buffer) +n_bytes = sum(1:25) do genome + rec = FASTARecord("seq_$(genome)", randdnaseq(20)) + write(writer, rec) +end +flush(writer) + +# Time minhashing the 50 genomes +timing = @timed FASTAReader(seekstart(buffer); copy=false) do reader + map(reader) do record + seq = codeunits(sequence(record)) + minhash(fx_hash, CanonicalDNAMers{16}(sequence(record)), 1000) + end +end +println(round(Int, n_bytes / (timing.time * 1e6)), " MB/s") + +# output + +200 MB/s +``` diff --git a/docs/src/predicates.md b/docs/src/predicates.md deleted file mode 100644 index 2aebc49..0000000 --- a/docs/src/predicates.md +++ /dev/null @@ -1,18 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Predicates - -The following predicate functions from BioSequences.jl are compatible with `Kmer`s. -Some have an optimised method defined in Kmers.jl. - -```@docs -isrepetitive -ispalindromic -hasambiguity -iscanonical -``` \ No newline at end of file diff --git a/docs/src/random.md b/docs/src/random.md deleted file mode 100644 index 8a1990b..0000000 --- a/docs/src/random.md +++ /dev/null @@ -1,14 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Generating random sequences - -You can generate random kmers using `Base.rand` function. - -```@docs -Base.rand(::Type{<:Kmer}) -``` \ No newline at end of file diff --git a/docs/src/transforms.md b/docs/src/transforms.md deleted file mode 100644 index 853b10c..0000000 --- a/docs/src/transforms.md +++ /dev/null @@ -1,52 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Indexing & modifying kmers - -## Indexing - -As `BioSequence` concrete subtypes, kmers can be indexed using integers - -```jldoctest -julia> kmer = Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC - -julia> kmer[3] -DNA_A -``` - -You can also slice Kmers using UnitRanges: - -```jldoctest -julia> kmer = Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC - -julia> kmer[1:3] -DNA 3-mer: -TTA -``` - -!!! warning - Using slicing will introduce performance penalties in your code if - you pass values of `i` that are not constants that can be propagated. - -## Modifying sequences - -Many modifying operations that are possible for some `BioSequences` such as -`LongSequence` are not possible for `Kmer`s, this is primarily due to the fact -`Kmer`s are an immutable struct. - -However some non-mutating transformations are available: - -```@docs -BioSequences.complement(::Kmer) -Base.reverse(::Kmer) -BioSequences.reverse_complement(::Kmer) -canonical -``` \ No newline at end of file diff --git a/docs/src/translate.md b/docs/src/translate.md deleted file mode 100644 index d518556..0000000 --- a/docs/src/translate.md +++ /dev/null @@ -1,62 +0,0 @@ -```@meta -CurrentModule = Kmers -DocTestSetup = quote - using Kmers -end -``` - -# Translating and reverse translating - -## Translating -Just like other `BioSequence`s, `Kmer`s of RNA or DNA alphabets can be efficiently translated to amino acids: - -``` -julia> kmer = RNAKmer("AUGGGCCACUGA"); - -julia> translate(kmer) -AminoAcid 4-mer: -MGH* -``` - -For more information on translation and different genetic codes, see the documentation of BioSequences.jl. - -## Reverse translation -Reverse translation (or "revtrans", for short) refers to the mapping from amino acids back to the set of RNA codons that code for the given amino acid, under a given genetic code. -There is no known natural process of revtrans, but it can be useful to do _in silico_. - -In Kmers.jl, revtrans is done through the `reverse_translate` function. -This takes an amino acid sequence and produces a `Vector{CodonSet}`, where `CodonSet <: AbstractSet{RNACodon}`. -Alternatively, it takes an amino acid and produces a `CodonSet`. - -A reverse genetic code can optionally be specified as the second argument. -If not provided, it default to the reverse standard genetic code. - -### Example of reverse translation -```julia -julia> reverse_translate(AA_W) # default to standard genetic code -Kmers.CodonSet with 1 element: - UGG - -julia> code = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code); - -julia> reverse_translate(AA_W, code) -Kmers.CodonSet with 2 elements: - UGA - UGG -``` - -### Important notes on reverse translation -* `AA_Gap` cannot be reverse translated. Attempting so throws an error -* In cells, `AA_O` and `AA_U` are encoded by dynamic overloading of the codons `UAG` and `UGA`, respectively. - Because these codons normally code for `AA_Term`, the forward genetic code returns `AA_Term` for these codons. - However, we can unambiguously reverse translate them, so these amino acids translate to codonsets with these - precise codons. -* Ambiguous amino acids translate to the union of the possible amino acids. For example, if `AA_L` translate to set `S1`, - and `AA_I` translate to `S2`, then `AA_J` translate to `union(S1, S2)`. - -```@docs -Kmers.CodonSet -Kmers.ReverseGeneticCode -reverse_translate -reverse_translate! -``` diff --git a/docs/src/translation.md b/docs/src/translation.md new file mode 100644 index 0000000..9fe222b --- /dev/null +++ b/docs/src/translation.md @@ -0,0 +1,45 @@ +```@meta +CurrentModule = Kmers +DocTestSetup = quote + using BioSequences + using Test + using Kmers +end +``` + +## Translation +`Kmer`s can be translated using the `translate` function exported by `BioSequences`: + +```jldoctest +julia> translate(mer"UGCUUGAUC"r) +AminoAcid 3-mer: +CLI +``` + +Since `Kmer`s are immutable, the in-place `translate!` function is not implemented for `Kmers`. +Also, remember that `Kmer`s are only efficient when short (at most a few hundred symbols). Hence, entire exons or genes should probably not ever be represented by a `Kmer`, but rather as a `LongSequence` or `LongSubSeq` from BioSequences.jl. + +### Reverse translation +Kmers.jl implements reverse translation, in which an amino acid sequence is translated to an RNA sequence. +While this process doesn't occur naturally (as far as we know), it is still useful for some analyses. + +Since genetic codes are degenerate, i.e. multiple codons code for the same amino acid, reverse translating a sequence does not return a nucleic acid sequence, but a vector of `CodonSet`: + +```@docs +reverse_translate +CodonSet +``` + +`CodonSet` is an efficiently implemented `AbstractSet{RNACodon}` (and remember, `RNACodon` is an alias for `RNAKmer{3, 1}`). + +To avoid allocating a new `Vector`, you can use `reverse_translate!`: + +```@docs +reverse_translate! +``` + +Both functions take a genetic code as a keyword argument of the type `ReverseGeneticCode`. This object determines the mapping from amino acid to `CodonSet` - by default the [standard genetic code](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables#Standard_RNA_codon_table) is used - this mapping is used by nearly all organisms: + +```@docs +ReverseGeneticCode +``` \ No newline at end of file diff --git a/ext/StringViewsExt.jl b/ext/StringViewsExt.jl new file mode 100644 index 0000000..f674427 --- /dev/null +++ b/ext/StringViewsExt.jl @@ -0,0 +1,11 @@ +module StringViewsExt + +using StringViews: StringView +using Kmers: Kmers + +# This extension is important because FASTX uses string views. +# The documentation of StringViews promises that the underlying +# string is UTF-8 encoded. +Kmers.is_ascii(::Type{<:StringView}) = true + +end # module diff --git a/src/Kmers.jl b/src/Kmers.jl index be1a2c5..e098a69 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -5,13 +5,56 @@ # # This file is a part of the Kmers.jl, a package in the BioJulia ecosystem. # License is MIT: https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE +module Kmers + +export Kmer, + Mer, + DNAKmer, + RNAKmer, + AAKmer, + DNACodon, + RNACodon, + ReverseGeneticCode, + reverse_translate, + reverse_translate!, + @mer_str, + fx_hash, + + # Immutable operations + push, + push_first, + shift, + shift_first, + pop, + pop_first, -__precompile__() + # Iterators + FwKmers, + FwDNAMers, + FwRNAMers, + FwAAMers, + FwRvIterator, + CanonicalKmers, + CanonicalDNAMers, + CanonicalRNAMers, + UnambiguousKmers, + UnambiguousDNAMers, + UnambiguousRNAMers, + SpacedKmers, + SpacedDNAMers, + SpacedRNAMers, + SpacedAAMers, + each_dna_codon, + each_rna_codon, -module Kmers + # Reverse translation + CodonSet, + delete, # push already exported -export - # BioSymbols re-exports. + ################## + # Re-exports + ################## + # BioSymbols re-exports NucleicAcid, DNA, RNA, @@ -80,7 +123,7 @@ export AA_X, AA_Term, AA_Gap, - + # BioSequences re-exports Alphabet, BioSequence, @@ -89,65 +132,47 @@ export DNAAlphabet, RNAAlphabet, translate, - - - ### - ### Mers - ### + complement, + reverse_complement, + canonical, + iscanonical - # Type & aliases - Kmer, - DNAKmer, - DNA27mer, - DNA31mer, - DNA63mer, - RNAKmer, - RNA27mer, - RNA31mer, - RNA63mer, - AAKmer, - DNACodon, - RNACodon, +# Kmers.jl is tightly coupled to BioSequences and relies on much of its internals. +# Hence, we do not care about carefully importing specific symbols +using BioSequences - # Iteration - EveryKmer, - SpacedKmers, - EveryCanonicalKmer, - SpacedCanonicalKmers, - fw_neighbors, - bw_neighbors, +# This is a documented method, not internals +using Base: tail - # Immutable operators - push, - delete, +""" + Kmers.Unsafe - # Translation - reverse_translate, - reverse_translate!, - ReverseGeneticCode, - - ### - ### Sequence literals - ### - - @mer_str, - @bigmer_str +Internal trait object used to access unsafe methods of functions. +`unsafe` is the singleton of `Unsafe`. +""" +struct Unsafe end +const unsafe = Unsafe() -using BioSequences - -ispermitted(::DNAAlphabet{2}, nt::DNA) = count_ones(nt) == 1 && isvalid(nt) -ispermitted(::DNAAlphabet{2}, data::UInt) = data < UInt(4) -ispermitted(::DNAAlphabet{4}, nt::DNA) = isvalid(nt) -ispermitted(::DNAAlphabet{4}, data::UInt) = isvalid(DNA, data) -ispermitted(::AminoAcidAlphabet, aa::AminoAcid) = reinterpret(UInt8, aa) <= reinterpret(UInt8, AA_Gap) -ispermitted(::AminoAcidAlphabet, data::UInt) = data <= 0x1b +const FourBit = Union{DNAAlphabet{4}, RNAAlphabet{4}} +const TwoBit = Union{DNAAlphabet{2}, RNAAlphabet{2}} +const BitInteger = + Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128} +include("tuple_bitflipping.jl") include("kmer.jl") +include("construction.jl") +include("indexing.jl") +include("transformations.jl") +include("revtrans.jl") + +include("iterators/common.jl") +include("iterators/FwKmers.jl") +include("iterators/CanonicalKmers.jl") +include("iterators/UnambiguousKmers.jl") +include("iterators/SpacedKmers.jl") -include("kmer_iteration/AbstractKmerIterator.jl") -include("kmer_iteration/EveryKmer.jl") -include("kmer_iteration/SpacedKmers.jl") -include("kmer_iteration/EveryCanonicalKmer.jl") -include("kmer_iteration/SpacedCanonicalKmers.jl") +if !isdefined(Base, :get_extension) + include("../ext/StringViewsExt.jl") +end end # module diff --git a/src/construction.jl b/src/construction.jl new file mode 100644 index 0000000..fdf0475 --- /dev/null +++ b/src/construction.jl @@ -0,0 +1,303 @@ +################################################ +# Trait dispatch +################################################ + +"""Trait object which based on static dispatch determines how to recode from +the encoding of the source sequence to the encoding of the kmer""" +abstract type RecodingScheme end + +"We can copy the encoding straight from the source to the kmer" +struct Copyable <: RecodingScheme end + +"We can copy the encoding, then bitshift to create 4-bit encoding" +struct TwoToFour <: RecodingScheme end + +"We skip all symbols in source that contain unmappable symbols" +struct FourToTwo <: RecodingScheme end + +"We can use `BioSequences.ascii_encode`" +struct AsciiEncode <: RecodingScheme end + +"The source is a bytevector, and we have no static knowledge of efficient +conversion to the right encoding" +struct GenericRecoding <: RecodingScheme end + +""" + is_ascii(::Type{T})::Bool + +Trait function. Should return `true` for `AbstractVector{UInt8}`, or for +string types for which `codeunits(s)` returns an `AbstractVector{UInt8}`, where +every ASCII byte in the string is perserved in the vector. +This is true for all UTF8, latin1 and ASCII encoded string types. +""" +is_ascii(::Type) = false +is_ascii(::Type{<:Union{String, SubString{String}}}) = true +is_ascii(::Type{<:AbstractVector{UInt8}}) = true + +function RecodingScheme(A::Alphabet, source_type::Type)::RecodingScheme + return if source_type <: BioSequence + if BioSequences.encoded_data_eltype(source_type) <: BitInteger + As = Alphabet(source_type) + if As == A + Copyable() + elseif As isa TwoBit && A isa TwoBit + Copyable() + elseif As isa FourBit && A isa FourBit + Copyable() + elseif As isa FourBit && A isa TwoBit + FourToTwo() + elseif As isa TwoBit && A isa FourBit + TwoToFour() + else + GenericRecoding() + end + else + GenericRecoding() + end + elseif is_ascii(source_type) && BioSequences.codetype(A) isa BioSequences.AsciiAlphabet + AsciiEncode() + else + GenericRecoding() + end +end + +################################################ +# Unsafe extract +################################################ + +"Extract a full kmer at a given index of a sequence. +Note: These methods don't do any bounds checking" +function unsafe_extract end + +@inline function unsafe_extract( + ::TwoToFour, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + for i in 1:ksize(T) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(seq, i))) + (_, data) = leftshift_carry(data, 4, encoding) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::FourToTwo, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + for i in 1:ksize(T) + encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(T), + reinterpret(eltype(seq), encoding % UInt8), + ), + ) + end + (_, data) = leftshift_carry(data, 2, trailing_zeros(encoding) % UInt) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::Copyable, + ::Type{T}, + seq::BioSequence, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(seq)) + for i in from_index:(from_index + ksize(T) - 1) + encoding = UInt(BioSequences.extract_encoded_element(seq, i))::UInt + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::AsciiEncode, + ::Type{T}, + seq::AbstractVector{UInt8}, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(T)) + @inbounds for i in from_index:(from_index + ksize(T) - 1) + byte = seq[i] + encoding = BioSequences.ascii_encode(Alphabet(T), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(T), byte)) + end + (_, data) = leftshift_carry(data, bps, encoding % UInt) + end + T(unsafe, data) +end + +@inline function unsafe_extract( + ::GenericRecoding, + ::Type{T}, + seq, + from_index, +) where {T <: Kmer} + data = zero_tuple(T) + bps = BioSequences.bits_per_symbol(Alphabet(T)) + @inbounds for i in 1:ksize(T) + symbol = convert(eltype(T), seq[i]) + encoding = UInt(BioSequences.encode(Alphabet(T), symbol)) + (_, data) = leftshift_carry(data, bps, encoding) + end + T(unsafe, data) +end + +################################################ +# Constructors with full parameterisation +################################################ + +function Kmer{A, K, N}(x) where {A, K, N} + check_kmer(Kmer{A, K, N}) + build_kmer(RecodingScheme(A(), typeof(x)), Kmer{A, K, N}, x) +end + +# BioSequences support indexing and fast length checks +@inline function build_kmer(R::RecodingScheme, ::Type{T}, s::BioSequence) where {T} + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + unsafe_extract(R, T, s, 1) +end + +# LongSequence with compatible alphabet: Extract whole coding elements +@inline function build_kmer(::Copyable, ::Type{T}, s::LongSequence) where {T} + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + bps = BioSequences.BitsPerSymbol(Alphabet(T)) + data = ntuple(i -> BioSequences.reversebits(@inbounds(s.data[i]), bps), Val{nsize(T)}()) + (_, data) = rightshift_carry(data, bits_unused(T), zero(UInt)) + T(unsafe, data) +end + +# TODO: LongSubSeq with compatible alphabet +# Note: LongSequence may be UInt64 whereas kmers use UInt32 + +# For UTF8-strings combined with an ASCII kmer alphabet, we convert to byte vector +@inline function build_kmer( + R::AsciiEncode, + ::Type{T}, + s::Union{String, SubString{String}}, +) where {T} + build_kmer(R, T, codeunits(s)) +end + +# For byte vectors, we can build a kmer iff the kmer alphabet is AsciiAlphabet +@inline function build_kmer(R::AsciiEncode, ::Type{T}, s::AbstractVector{UInt8}) where {T} + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + unsafe_extract(R, T, s, 1) +end + +# The generic fallback - dispatch on whether we can check length once +@inline function build_kmer(R::RecodingScheme, T::Type, s) + build_kmer(Base.IteratorSize(typeof(s)), R, T, s) +end + +@inline function build_kmer(::Base.SizeUnknown, ::RecodingScheme, T::Type, s) + data = zero_tuple(T) + A = Alphabet(T) + bps = BioSequences.bits_per_symbol(A) + i = 0 + for element in s + i += 1 + i > ksize(T) && error("Length of sequence must be K elements to build Kmer") + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A, symbol)) + (_, data) = leftshift_carry(data, bps, carry) + end + i == ksize(T) || error("Length of sequence must be K elements to build Kmer") + T(unsafe, data) +end + +@inline function build_kmer( + ::Union{Base.HasLength, Base.HasShape}, + ::RecodingScheme, + T::Type, + s, +) + length(s) == ksize(T) || error("Length of sequence must be K elements to build Kmer") + data = zero_tuple(T) + A = Alphabet(T) + bps = BioSequences.bits_per_symbol(A) + for element in s + symbol = convert(eltype(A), element) + carry = UInt(BioSequences.encode(A, symbol)) + (_, data) = leftshift_carry(data, bps, carry) + end + T(unsafe, data) +end + +################################################ +# Derived constructors +################################################ + +Kmer{A, K}(x) where {A, K} = derive_type(Kmer{A, K})(x) +Kmer{A1}(x::Kmer{A2, K, N}) where {A1, A2, K, N} = Kmer{A1, K, N}(x) + +function kmer(::Val{K}, s::BioSequence{A}) where {A, K} + K isa Int || error("K must be an Int") + Kmer{A, K}(s) +end + +################################################ +# Construct other types from Kmers +################################################ + +# TODO: LongSequence +# TODO: String + +################################################ +# String literals +################################################ + +""" + @mer_str -> Kmer + +Construct a `Kmer` from the given string. The macro must be used with a flag +after the input string, e.g. `d` in `mer"TAG"d` or `a` in `mer"PCW"a`, signifying +the alphabet of the kmer. +The flags `d = DNAAlphabet{2}`, `r = RNAAlphabet{2}` and `a = AminoAcidAlphabet` +are recognized. + +Because the macro is resolved and the kmer is created at parse time, +the macro is type stable, and may be used in high performance code. + +# Examples +```jldoctest +julia> mer"UGCUA"r +RNA 5-mer: +UGCUA + +julia> mer"YKVSTEDLLKKR"a +AminoAcid 12-mer: +YKVSTEDLLKKR + +julia> mer"TATTAGCA"d +DNA 8-mer: +TATTAGCA +``` +""" +macro mer_str(seq, flag) + trimmed = BioSequences.remove_newlines(seq) + ncu = ncodeunits(trimmed) + # Unlike @dna_str, we default to 2-bit alphabets, because kmers + # by convention are usually 2-bit only + if flag == "dna" || flag == "d" + Kmer{DNAAlphabet{2}, ncu}(trimmed) + elseif flag == "rna" || flag == "r" + Kmer{RNAAlphabet{2}, ncu}(trimmed) + elseif flag == "aa" || flag == "a" + Kmer{AminoAcidAlphabet, ncu}(trimmed) + else + error("Invalid type flag: '$(flag)'") + end +end diff --git a/src/counting.jl b/src/counting.jl deleted file mode 100644 index cbd9340..0000000 --- a/src/counting.jl +++ /dev/null @@ -1,47 +0,0 @@ -### -### Mer specific specializations of src/biosequence/counting.jl -### - -for i in [(:_count_a, :a_bitcount), (:_count_c, :c_bitcount), (:_count_g, :g_bitcount), (:_count_t, :t_bitcount)] - @eval begin - @inline function $(i[1])(alph::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} - return $BioSequences.$(i[2])(head, alph) + $(i[1])(alph, tail...) - end - @inline $(i[1])(alph::A) where {A<:NucleicAcidAlphabet} = 0 - end -end - -@inline function _count_gc(alph::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} - return BioSequences.gc_bitcount(head, alph) + _count_gc(alph, tail...) -end -@inline _count_gc(::A) where {A<:NucleicAcidAlphabet} = 0 - -count_a(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_a(A(), x.data...) - n_unused(x) -count_c(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_c(A(), x.data...) -count_g(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_g(A(), x.data...) -count_t(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_t(A(), x.data...) - -count_gc(x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = _count_gc(A(), x.data...) -Base.count(::typeof(isGC), x::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} = count_gc(x) - -# TODO: Expand to Amino Acid Kmers as well... -@inline function Base.count(::typeof(!=), a::Kmer{A,K,N}, b::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} - ad = a.data - bd = b.data - sum = 0 - @inbounds for i in 1:N - sum += BioSequences.mismatch_bitcount(ad[i], bd[i], A()) - end - return sum -end - -# TODO: Expand to Amino Acid Kmers as well... -@inline function Base.count(::typeof(==), a::Kmer{A,K,N}, b::Kmer{A,K,N}) where {A<:NucleicAcidAlphabet,K,N} - ad = a.data - bd = b.data - sum = 0 - @inbounds for i in 1:N - sum += BioSequences.match_bitcount(ad[i], bd[i], A()) - end - return sum - n_unused(a) -end \ No newline at end of file diff --git a/src/indexing.jl b/src/indexing.jl index 021926a..9a406e6 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -1,35 +1,81 @@ -@inline BioSequences.encoded_data_eltype(::Type{<:Kmer}) = UInt64 - @inline function BioSequences.extract_encoded_element(seq::Kmer, i::Integer) - bi = BioSequences.bitindex(seq, i % UInt) - return BioSequences.extract_encoded_element(bi, seq.data) + T = typeof(seq) + bps = BioSequences.bits_per_symbol(Alphabet(seq)) % UInt + index = div((i + n_unused(T) - 1) % UInt, per_word_capacity(T) % UInt) + 1 + offset = mod(((elements_in_head(T) - i) * bps) % UInt, 8 * sizeof(UInt)) + mask = UInt(1) << bps - 1 + right_shift(@inbounds(seq.data[index]), offset) & mask end -@inline Base.copy(seq::Kmer) = typeof(seq)(seq.data) - -@inline encoded_data(x::Kmer) = x.data - -@inline BioSequences.bitindex(seq::Kmer, i::Integer) = BioSequences.bitindex(BioSequences.BitsPerSymbol(seq), BioSequences.encoded_data_eltype(typeof(seq)), i + n_unused(seq)) +# This is usually type unstable, but in user code, users may use constant-folded ranges, +# e.g. f(x) = x[2:4]. In this case, we need it to compile to very efficient code. +# Hence, it MUST use @inline +@inline function Base.getindex(kmer::Kmer{A}, range::AbstractUnitRange{<:Integer}) where {A} + @boundscheck checkbounds(kmer, range) + K = length(range) + iszero(K) && return Kmer{A, 0, 0}(unsafe, ()) + (i1, _) = BioSequences.bitindex(kmer, first(range)) + (i2, o2) = BioSequences.bitindex(kmer, last(range)) + data = kmer.data[i1:i2] + (_, data) = rightshift_carry(data, o2, zero(UInt)) + T = derive_type(Kmer{A, K}) + N = nsize(T) + # After the shift, the first coding element may be unused + new_data = if N < length(data) + tail(data) + else + data + end + T(unsafe, (first(new_data) & get_mask(T), tail(new_data)...)) +end +# Same as above: This needs to be able to inline if the indices are known statically +@inline function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{Bool}) where {A} + @boundscheck checkbounds(eachindex(kmer), indices) + K = sum(indices) + N = n_coding_elements(Kmer{A, K}) + T = Kmer{A, K, N} + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for (i, bool) in enumerate(indices) + bool || continue + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + end + T(unsafe, data) +end -""" -Base.getindex(seq::Kmer, i::UnitRange) +function Base.getindex(kmer::Kmer{A}, indices::AbstractVector{<:Integer}) where {A} + K = length(indices) + N = n_coding_elements(Kmer{A, K}) + T = Kmer{A, K, N} + data = zero_tuple(T) + nbits = BioSequences.bits_per_symbol(A()) + for i in indices + checkbounds(kmer, i) + (_, data) = + leftshift_carry(data, nbits, BioSequences.extract_encoded_element(kmer, i)) + end + T(unsafe, data) +end -Slice a Kmer by a UnitRange. +@inline function BioSequences.bitindex(kmer::Kmer, i::Integer)::Tuple{UInt, UInt} + bps = BioSequences.bits_per_symbol(kmer) % UInt + bpe = (8 * sizeof(UInt)) % UInt + (i, o) = divrem((UInt(i) - UInt(1) + n_unused(typeof(kmer))) * bps, bpe) + o = bpe - o - bps + i + 1, o +end -!!! warning - Using this function will introduce performance penalties in your code if - you pass values of `i` that are not constants that can be propagated. -""" -@inline function Base.getindex(seq::Kmer{A}, i::UnitRange) where A - @boundscheck Base.checkbounds(seq, i) - ind(s, i) = BioSequences.index(BioSequences.bitindex(s, i)) - off(s, i) = BioSequences.offset(BioSequences.bitindex(s, i)) - isempty(i) && return Kmer{A, 0, 0}(()) - rshift = (64 - off(seq, last(i) + 1)) & 63 - stop = ind(seq, last(i)) - start = BioSequences.index(BioSequences.bitindex(seq, first(i)) + rshift) - data = Kmers.rightshift_carry(seq.data, rshift) - T = Kmers.kmertype(Kmer{A, length(i)}) - return T(data[start:stop]) -end \ No newline at end of file +@inline function setindex(kmer::Kmer, i::Integer, s) + @boundscheck checkbounds(kmer, i) + bps = BioSequences.bits_per_symbol(kmer) + symbol = convert(eltype(kmer), s) + encoding = UInt(BioSequences.encode(Alphabet(kmer), symbol)) + (i, o) = BioSequences.bitindex(kmer, i % UInt) + element = @inbounds kmer.data[i] + mask = left_shift(UInt(1) << bps - 1, o) + element &= ~mask + element |= left_shift(encoding, o) + typeof(kmer)(unsafe, @inbounds Base.setindex(kmer.data, element, i)) +end diff --git a/src/iterators/CanonicalKmers.jl b/src/iterators/CanonicalKmers.jl new file mode 100644 index 0000000..87a8c8e --- /dev/null +++ b/src/iterators/CanonicalKmers.jl @@ -0,0 +1,220 @@ +""" + FwRvIterator{A <: NucleicAcidAlphabet, K, S} + +Iterates 2-tuples of `(forward, reverse_complement)` kmers of type `Kmer{A, K}`. +`S` signifies the type of the underlying sequence, + +See also: [`FwKmers`](@ref), [`CanonicalKmers`](@ref) + +# Examples: +```jldoctest +julia> collect(FwRvIterator{DNAAlphabet{4}, 3}("AGCGT")) +3-element Vector{Tuple{Mer{3, DNAAlphabet{4}, 1}, Mer{3, DNAAlphabet{4}, 1}}}: + (AGC, GCT) + (GCG, CGC) + (CGT, ACG) + +julia> collect(FwRvIterator{DNAAlphabet{2}, 3}("AGNGT")) +ERROR: cannot encode 0x4e (Char 'N') in DNAAlphabet{2} +[...] +``` +""" +struct FwRvIterator{A <: NucleicAcidAlphabet, K, S} + seq::S + + function FwRvIterator{A, K, S}(seq::S) where {A, K, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + new{A, K, S}(seq) + end +end + +source_type(::Type{FwRvIterator{A, K, S}}) where {A, K, S} = S +kmertype(::Type{<:FwRvIterator{A, K}}) where {A, K} = derive_type(Kmer{A, K}) +kmertype(it::FwRvIterator) = kmertype(typeof(it)) +Base.eltype(T::Type{<:FwRvIterator{A, K}}) where {A, K} = + Tuple{K, K} where {K <: kmertype(T)} + +@inline function Base.length(it::FwRvIterator{A, K, S}) where {A, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + max(0, length(src) - K + 1) +end + +FwRvIterator{A, K}(s) where {A <: Alphabet, K} = FwRvIterator{A, K, typeof(s)}(s) + +@inline function Base.iterate(it::FwRvIterator{A, K, S}, state...) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state...) +end + +# For the first kmer, we extract it, then reverse complement. +# When it's not done incrementally, it's faster to RC the whole +# kmer at once. +@inline function iterate_kmer(R::RecodingScheme, it::FwRvIterator{A, K}) where {A, K} + length(it.seq) < K && return nothing + fw = unsafe_extract(R, kmertype(it), it.seq, 1) + rv = reverse_complement(fw) + ((fw, rv), (fw, rv, K + 1)) +end + +# Here, we need to convert to an abstractvector +@inline function iterate_kmer( + R::AsciiEncode, + it::FwRvIterator{A, K, S}, +) where {A <: NucleicAcidAlphabet, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + Base.require_one_based_indexing(src) + length(src) < K && return nothing + fw = unsafe_extract(R, kmertype(it), src, 1) + rv = reverse_complement(fw) + ((fw, rv), (fw, rv, K + 1)) +end + +@inline function iterate_kmer( + ::GenericRecoding, + it::FwRvIterator, + state::Tuple{Kmer, Kmer, Int}, +) + (fw, rv, i) = state + i > length(it.seq) && return nothing + symbol = convert(eltype(fw), @inbounds it.seq[i]) + fw = shift(fw, symbol) + rv = shift_first(rv, complement(symbol)) + ((fw, rv), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::Copyable, + it::FwRvIterator{<:TwoBit, K, <:BioSequence{<:TwoBit}}, + state::Tuple{Kmer, Kmer, Int}, +) where {K} + (fw, rv, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) + fw = shift_encoding(fw, encoding) + rv = shift_first_encoding(rv, encoding ⊻ 0x03) + ((fw, rv), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::Copyable, + it::FwRvIterator{<:FourBit, K, <:BioSequence{<:FourBit}}, + state::Tuple{Kmer, Kmer, Int}, +) where {K} + (fw, rv, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) + fw = shift_encoding(fw, encoding) + rc_encoding = + reinterpret(UInt8, complement(reinterpret(eltype(rv), encoding % UInt8))) % UInt + rv = shift_first_encoding(rv, rc_encoding) + ((fw, rv), (fw, rv, i + 1)) +end + +@inline function iterate_kmer(::TwoToFour, it::FwRvIterator, state::Tuple{Kmer, Kmer, Int}) + (fw, rv, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) + fw = shift_encoding(fw, left_shift(UInt(1), encoding)) + rv = shift_first_encoding(rv, left_shift(UInt(1), encoding ⊻ 0x03)) + ((fw, rv), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::FourToTwo, + it::FwRvIterator{A, K, <:BioSequence}, + state::Tuple{Kmer, Kmer, Int}, +) where {A, K} + (fw, rv, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(fw), + reinterpret(eltype(it.seq), encoding % UInt8), + ), + ) + end + enc = trailing_zeros(encoding) % UInt + fw = shift_encoding(fw, enc) + rv = shift_first_encoding(rv, enc ⊻ 0x03) + ((fw, rv), (fw, rv, i + 1)) +end + +@inline function iterate_kmer( + ::AsciiEncode, + it::FwRvIterator{A}, + state::Tuple{Kmer, Kmer, Int}, +) where {A} + src = used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq) + Base.require_one_based_indexing(src) + (fw, rv, i) = state + i > length(src) && return nothing + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(A(), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(A(), repr(byte))) + end + # Hopefully this branch is eliminated at compile time... + rc_encoding = if Alphabet(fw) isa FourBit + reinterpret(UInt8, complement(reinterpret(DNA, encoding))) + elseif Alphabet(fw) isa TwoBit + encoding ⊻ 0x03 + else + error("Unreachable") + end + fw = shift_encoding(fw, encoding % UInt) + rv = shift_first_encoding(rv, rc_encoding % UInt) + ((fw, rv), (fw, rv, i + 1)) +end + +""" + CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} + +Iterator of canonical nucleic acid kmers. The result of this iterator is equivalent +to calling `canonical` on each value of a `FwKmers` iterator, but may be more +efficient. + +!!! note + When counting small kmers, it may be more efficient to count `FwKmers`, + then call `canonical` only once per unique kmer. + +Can be constructed more conventiently with the constructors `CanonicalDNAMers{K}(s)` +`CanonicalRNAMers{K}(s)` + +# Examples: +```jldoctest +julia> collect(CanonicalRNAMers{3}("AGCGA")) +3-element Vector{Kmer{RNAAlphabet{2}, 3, 1}}: + AGC + CGC + CGA +``` +""" +struct CanonicalKmers{A <: NucleicAcidAlphabet, K, S} <: AbstractKmerIterator{A, K} + it::FwRvIterator{A, K, S} +end + +source_type(::Type{CanonicalKmers{A, K, S}}) where {A, K, S} = S +@inline Base.length(it::CanonicalKmers) = length(it.it) + +# Constructors +function CanonicalKmers{A, K}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwRvIterator{A, K}(s)) +end +function CanonicalKmers{A, K, S}(s::S) where {S, A <: NucleicAcidAlphabet, K} + CanonicalKmers{A, K, S}(FwRvIterator{A, K}(s)) +end + +"`CanonicalDNAMers{K, S}`: Alias for `CanonicalKmers{DNAAlphabet{2}, K, S}`" +const CanonicalDNAMers{K, S} = CanonicalKmers{DNAAlphabet{2}, K, S} + +"`CanonicalRNAMers{K, S}`: Alias for `CanonicalKmers{RNAAlphabet{2}, K, S}`" +const CanonicalRNAMers{K, S} = CanonicalKmers{RNAAlphabet{2}, K, S} + +@inline function Base.iterate(it::CanonicalKmers{A, K, S}, state...) where {A, K, S} + it = iterate(it.it, state...) + isnothing(it) && return nothing + ((fw, rv), state) = it + (fw < rv ? fw : rv, state) +end diff --git a/src/iterators/FwKmers.jl b/src/iterators/FwKmers.jl new file mode 100644 index 0000000..f199a14 --- /dev/null +++ b/src/iterators/FwKmers.jl @@ -0,0 +1,131 @@ +""" + FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} + +Iterator of forward kmers. `S` signifies the type of the underlying sequence, +and the eltype of the iterator is `Kmer{A, K, N}` with the appropriate `N`. + +Can be constructed more conventiently with the constructors `FwDNAMers{K}(s)` +and similar also for `FwRNAMers` and `FwAAMers`. + +# Examples: +```jldoctest +julia> v = collect(FwDNAMers{3}("AGCGTATA")); + +julia> eltype(v), length(v) +(Kmer{DNAAlphabet{2}, 3, 1}, 6) + +julia> collect(FwRNAMers{3}(rna"UGCDUGAVC")) +ERROR: cannot encode D in RNAAlphabet{2} +``` +""" +struct FwKmers{A <: Alphabet, K, S} <: AbstractKmerIterator{A, K} + seq::S + + function FwKmers{A, K, S}(seq::S) where {A, K, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + new{A, K, S}(seq) + end +end + +source_type(::Type{FwKmers{A, K, S}}) where {A, K, S} = S + +@inline function Base.length(it::FwKmers{A, K, S}) where {A, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + max(0, length(src) - K + 1) +end + +# Constructors +FwKmers{A, K}(s) where {A <: Alphabet, K} = FwKmers{A, K, typeof(s)}(s) + +"`FwDNAMers{K, S}`: Alias for `FwKmers{DNAAlphabet{2}, K, S}`" +const FwDNAMers{K, S} = FwKmers{DNAAlphabet{2}, K, S} + +"`FwRNAMers{K, S}`: Alias for `FwKmers{RNAAlphabet{2}, K, S}`" +const FwRNAMers{K, S} = FwKmers{RNAAlphabet{2}, K, S} + +"`FwAAMers{K, S}`: Alias for `FwKmers{AminoAcidAlphabet, K, S}`" +const FwAAMers{K, S} = FwKmers{AminoAcidAlphabet, K, S} + +# TODO: Should this go in common? +@inline function Base.iterate(it::FwKmers{A, K, S}, state...) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state...) +end + +# For the first kmer, we just forward to `unsafe_extract` +@inline function iterate_kmer(R::RecodingScheme, it::FwKmers) + length(it.seq) < ksize(eltype(it)) && return nothing + kmer = unsafe_extract(R, eltype(it), it.seq, 1) + (kmer, (kmer, ksize(eltype(it)) + 1)) +end + +# Here, we need to convert to an abstractvector +@inline function iterate_kmer( + R::AsciiEncode, + it::FwKmers{A, K, S}, +) where {A <: Alphabet, K, S} + src = used_source(RecodingScheme(A(), S), it.seq) + Base.require_one_based_indexing(src) + length(src) < K && return nothing + kmer = unsafe_extract(R, eltype(it), src, 1) + (kmer, (kmer, K + 1)) +end + +@inline function iterate_kmer(::GenericRecoding, it::FwKmers, state::Tuple{Kmer, Int}) + (kmer, i) = state + i > length(it.seq) && return nothing + symbol = @inbounds it.seq[i] + new_kmer = shift(kmer, convert(eltype(kmer), symbol)) + (new_kmer, (new_kmer, nextind(it.seq, i))) +end + +@inline function iterate_kmer(::Copyable, it::FwKmers, state::Tuple{Kmer, Int}) + (kmer, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i)) + new_kmer = shift_encoding(kmer, encoding) + (new_kmer, (new_kmer, nextind(it.seq, i))) +end + +@inline function iterate_kmer(::TwoToFour, it::FwKmers, state::Tuple{Kmer, Int}) + (kmer, i) = state + i > length(it.seq) && return nothing + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(it.seq, i))) + new_kmer = shift_encoding(kmer, encoding) + (new_kmer, (new_kmer, nextind(it.seq, i))) +end + +@inline function iterate_kmer( + ::FourToTwo, + it::FwKmers{A, K, <:BioSequence}, + state::Tuple{Kmer, Int}, +) where {A, K} + (kmer, i) = state + i > length(it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + # TODO: Abstract this into a function + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(kmer), + reinterpret(eltype(it.seq), encoding % UInt8), + ), + ) + end + kmer = shift_encoding(kmer, trailing_zeros(encoding) % UInt) + return (kmer, (kmer, nextind(it.seq, i))) +end + +@inline function iterate_kmer(::AsciiEncode, it::FwKmers, state::Tuple{Kmer, Int}) + src = used_source(RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), it.seq) + Base.require_one_based_indexing(src) + (kmer, i) = state + i > length(src) && return nothing + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + end + kmer = shift_encoding(kmer, encoding % UInt) + return (kmer, (kmer, nextind(src, i))) +end diff --git a/src/iterators/SpacedKmers.jl b/src/iterators/SpacedKmers.jl new file mode 100644 index 0000000..13504ad --- /dev/null +++ b/src/iterators/SpacedKmers.jl @@ -0,0 +1,175 @@ +""" + SpacedKmers{A <: Alphabet, K, J, S} <: AbstractKmerIterator{A, K} + +Iterator of kmers with step size. `J` signifies the step size, `S` +the type of the underlying sequence, and the eltype of the iterator +is `Kmer{A, K, N}` with the appropriate `N` + +See also: [`each_dna_codon`](@ref), [`FwKmers`](@ref) + +# Examples: +```jldoctest +julia> collect(SpacedDNAMers{3, 2}("AGCGTATA")) +3-element Vector{Kmer{DNAAlphabet{2}, 3, 1}}: + AGC + CGT + TAT +``` +""" +struct SpacedKmers{A <: Alphabet, K, J, S} <: AbstractKmerIterator{A, K} + seq::S + + function SpacedKmers{A, K, J, S}(seq::S) where {A, K, J, S} + K isa Int || error("K must be an Int") + K > 0 || error("K must be at least 1") + J isa Int || error("J must be an Int") + J > 0 || error("J must be at least 1") + new{A, K, J, S}(seq) + end +end + +source_type(::Type{SpacedKmers{A, K, J, S}}) where {A, K, J, S} = S +stepsize(::SpacedKmers{A, K, J}) where {A, K, J} = J + +@inline function Base.length(it::SpacedKmers{A, K, J}) where {A, K, J} + src = used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq) + L = length(src) + L < K ? 0 : div((L - K), J) + 1 +end + +SpacedKmers{A, K, J}(s) where {A <: Alphabet, K, J} = SpacedKmers{A, K, J, typeof(s)}(s) + +"`SpacedDNAMers{K, J, S}`: Alias for `SpacedKmers{DNAAlphabet{2}, K, J, S}`" +const SpacedDNAMers{K, J, S} = SpacedKmers{DNAAlphabet{2}, K, J, S} + +"`SpacedRNAMers{K, J, S}`: Alias for `SpacedKmers{RNAAlphabet{2}, K, J, S}`" +const SpacedRNAMers{K, J, S} = SpacedKmers{RNAAlphabet{2}, K, J, S} + +"`SpacedAAMers{K, J, S}`: Alias for `SpacedKmers{AminoAcidAlphabet, K, J, S}`" +const SpacedAAMers{K, J, S} = SpacedKmers{AminoAcidAlphabet, K, J, S} + +# TODO: Do we need two function names for this?... +# Could it be each_codon(DNA, s) <- requires RNA/DNA to be exported +# each(DNACodon) <- awkward since DNACodon is merely an alias +""" + each_dna_codon(s) + +Construct an iterator of `DNACodon` from `s`, iterating over every `DNACodon` +in `s`, in-frame, i.e. with a step size of 3. + +See also: [`SpacedKmers`](@ref) + +Examples: +```jldoctest +julia> collect(each_dna_codon("TGACGATCGAC")) +3-element Vector{Kmer{DNAAlphabet{2}, 3, 1}}: + TGA + CGA + TCG +``` +""" +@inline each_dna_codon(s) = SpacedDNAMers{3, 3}(s) + +"The `RNA` equivalent of [`each_dna_codon`](@ref)" +@inline each_rna_codon(s) = SpacedRNAMers{3, 3}(s) + +@inline function Base.iterate(it::SpacedKmers{A}, state...) where {A} + iterate_kmer(RecodingScheme(A(), source_type(typeof(it))), it, state...) +end + +# TODO: Maybe in all kmer iterators, instantiate it with the source type, +# so we don't have to get the source type in functions (and thus +# it is allwoed to be a costly operation). +# However, this means we instantiate e.g. a FwKmers{A, K, S} and change S +# in the source type in the constructor +@inline function iterate_kmer( + R::RecodingScheme, + it::SpacedKmers{A, K}, +) where {A <: Alphabet, K} + length(it.seq) < K && return nothing + kmer = unsafe_extract( + R, + eltype(it), + used_source(RecodingScheme(A(), source_type(typeof(it))), it.seq), + 1, + ) + next_index = 1 + max(stepsize(it), K) + (kmer, (kmer, next_index)) +end + +# Here, we need to convert to an abstractvector +# TODO: This function and the one above can be merged with the FwKmers one? +@inline function iterate_kmer( + R::AsciiEncode, + it::SpacedKmers{A, K, J, S}, +) where {A <: Alphabet, K, J, S} + src = used_source(RecodingScheme(A(), S), it.seq) + Base.require_one_based_indexing(src) + length(src) < K && return nothing + kmer = unsafe_extract(R, eltype(it), src, 1) + next_index = 1 + max(stepsize(it), K) + (kmer, (kmer, next_index)) +end + +@inline function iterate_kmer( + ::RecodingScheme, + it::SpacedKmers{A, K, J, S}, + state, +) where {A, K, S, J} + src = used_source(RecodingScheme(A(), S), it.seq) + R = RecodingScheme(A(), S) + Base.require_one_based_indexing(src) + (kmer, i) = state + i > lastindex(src) - min(K, J) + 1 && return nothing + next_i = i + min(K, J) + # This branch should be resolved statically + if J ≥ K + kmer = unsafe_extract(R, eltype(it), src, i) + else + for _ in 1:J + kmer = update_kmer(R, it, kmer, i) + i += 1 + end + end + (kmer, (kmer, next_i)) +end + +# TODO: Can this function be used more generically, by the other iterators? +# I.e. this simply fetches a single element +@inline function update_kmer(::GenericRecoding, it::SpacedKmers, kmer::Kmer, i::Int) + symbol = @inbounds it.seq[i] + shift(kmer, convert(eltype(kmer), symbol)) +end + +@inline function update_kmer(::Copyable, it::SpacedKmers, kmer::Kmer, i::Int) + shift_encoding(kmer, UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt) +end + +@inline function update_kmer(::TwoToFour, it::SpacedKmers, kmer::Kmer, i::Int) + encoding = left_shift(UInt(1), UInt(BioSequences.extract_encoded_element(it.seq, i))) + shift_encoding(kmer, encoding) +end + +@inline function update_kmer(::FourToTwo, it::SpacedKmers, kmer::Kmer, i::Int) + encoding = UInt(BioSequences.extract_encoded_element(it.seq, i))::UInt + if count_ones(encoding) != 1 + throw( + BioSequences.EncodeError( + Alphabet(kmer), + reinterpret(eltype(it.seq), encoding % UInt8), + ), + ) + end + shift_encoding(kmer, trailing_zeros(encoding) % UInt) +end + +@inline function update_kmer(::AsciiEncode, it::SpacedKmers, kmer::Kmer, i::Int) + src = used_source(RecodingScheme(Alphabet(eltype(it)), source_type(typeof(it))), it.seq) + Base.require_one_based_indexing(src) + byte = @inbounds src[i] + encoding = BioSequences.ascii_encode(Alphabet(eltype(it)), byte) + if encoding > 0x7f + throw(BioSequences.EncodeError(Alphabet(eltype(it)), repr(byte))) + end + shift_encoding(kmer, encoding % UInt) +end diff --git a/src/iterators/UnambiguousKmers.jl b/src/iterators/UnambiguousKmers.jl new file mode 100644 index 0000000..4ee67aa --- /dev/null +++ b/src/iterators/UnambiguousKmers.jl @@ -0,0 +1,137 @@ +""" + UnambiguousKmers{A <: TwoBit, K, S} + +Iterator of 2-bit nucleic acid kmers. This differs from `FwKmers` in that any kmers +containing ambiguous nucleotides are skipped, whereas using `FwKmers`, they result +in an error. + +Can be constructed more conventiently with the constructors `UnambiguousDNAMers{K}(s)` +and `UnambiguousRNAMers{K}(s)`. + +!!! note + To obtain canonical unambiguous kmers, simply call `canonical` on each kmer output +by `UnambiguousKmers`. + +# Examples: +```jldoctest +julia> it = UnambiguousRNAMers{4}(dna"TGAGCWKCATC"); + +julia> collect(it) +3-element Vector{Tuple{Kmer{RNAAlphabet{2}, 4, 1}, Int64}}: + (UGAG, 1) + (GAGC, 2) + (CAUC, 8) +``` +""" +struct UnambiguousKmers{A <: TwoBit, K, S} + it::FwKmers{A, K, S} +end + +Base.IteratorSize(::Type{<:UnambiguousKmers}) = Base.SizeUnknown() +function Base.eltype(::Type{<:UnambiguousKmers{A, K}}) where {A, K} + Tuple{derive_type(Kmer{A, K}), Int} +end + +source_type(::Type{UnambiguousKmers{A, K, S}}) where {A, K, S} = S + +# Constructors +function UnambiguousKmers{A, K}(s::S) where {S, A <: TwoBit, K} + UnambiguousKmers{A, K, S}(FwKmers{A, K}(s)) +end +function UnambiguousKmers{A, K, S}(s::S) where {S, A <: TwoBit, K} + UnambiguousKmers{A, K, S}(FwKmers{A, K}(s)) +end + +"`UnambiguousDNAMers{K, S}`: Alias for `UnambiguousKmers{DNAAlphabet{2}, K, S}`" +const UnambiguousDNAMers{K, S} = UnambiguousKmers{DNAAlphabet{2}, K, S} + +"`UnambiguousRNAMers{K, S}`: Alias for `UnambiguousKmers{RNAAlphabet{2}, K, S}`" +const UnambiguousRNAMers{K, S} = UnambiguousKmers{RNAAlphabet{2}, K, S} + +@inline function Base.iterate(it::UnambiguousKmers{A, K, S}) where {A, K, S} + T = derive_type(Kmer{A, K}) + state = (T(unsafe, zero_tuple(T)), K, 1) + iterate_kmer(RecodingScheme(A(), S), it, state) +end + +@inline function Base.iterate(it::UnambiguousKmers{A, K, S}, state) where {A, K, S} + iterate_kmer(RecodingScheme(A(), S), it, state) +end + +@inline function iterate_kmer( + ::RecodingScheme, + it::UnambiguousKmers, + state::Tuple{Kmer, Int, Int}, +) + (kmer, remaining, index) = state + K = ksize(kmer) + while !iszero(remaining) + index > lastindex(it.it.seq) && return nothing + symbol = convert(typeof(kmer), it.it.seq[index]) + index += 1 + if isambiguous(symbol) + remaining = K + else + remaining -= 1 + kmer = shift(kmer, symbol) + end + end + ((kmer, index - K), (kmer, 1, index)) +end + +# Here, we can forward directly to FwKmers +@inline function iterate_kmer( + ::Copyable, + it::UnambiguousKmers, + state::Tuple{Kmer, Int, Int}, +) + (kmer, _, index) = state + itval = iterate(it.it, (kmer, index)) + if itval === nothing + nothing + else + (kmer, s) = itval + ((kmer, index - ksize(kmer)), s) + end +end + +@inline function iterate_kmer( + ::AsciiEncode, + it::UnambiguousKmers{A, K, S}, + state::Tuple{Kmer, Int, Int}, +) where {A <: TwoBit, K, S} + src = used_source(RecodingScheme(A(), S), it.it.seq) + Base.require_one_based_indexing(src) + (kmer, remaining, index) = state + while !iszero(remaining) + index > lastindex(src) && return nothing + byte = @inbounds src[index] + index += 1 + encoding = @inbounds ASCII_SKIPPING_LUT[(byte + 0x01) % Int] + if encoding == 0xff + throw(BioSequences.EncodeError(Alphabet(kmer), repr(byte))) + elseif encoding == 0xf0 + remaining = K + else + remaining -= 1 + kmer = shift_encoding(kmer, encoding % UInt) + end + end + ((kmer, index - K), (kmer, 1, index)) +end + +@inline function iterate_kmer( + ::FourToTwo, + it::UnambiguousKmers{A, K, S}, + state::Tuple{Kmer, Int, Int}, +) where {A <: TwoBit, K, S} + (kmer, remaining, index) = state + while !iszero(remaining) + index > lastindex(it.it.seq) && return nothing + encoding = UInt(BioSequences.extract_encoded_element(it.it.seq, index))::UInt + kmer = shift_encoding(kmer, (trailing_zeros(encoding)) % UInt) + index += 1 + remaining = isone(count_ones(encoding)) ? remaining - 1 : K + end + ((kmer, index - K), (kmer, 1, index)) +end diff --git a/src/iterators/common.jl b/src/iterators/common.jl new file mode 100644 index 0000000..cb9e992 --- /dev/null +++ b/src/iterators/common.jl @@ -0,0 +1,47 @@ +# TODO: Make sure to go through this docstring +""" + AbstractKmerIterator{A <: Alphabet, K} + +Abstract type for kmer iterators. The element type is `Kmer{A, K, N}`, +with the appropriately derived N. + +Iterates `Kmer{A, K}`. +Functions to implement: +* `Base.iterate` + +Optional functions: +* `source_type` +* `Base.IteratorSize`, if not `HasLength` +""" +abstract type AbstractKmerIterator{A <: Alphabet, K} end + +function Base.eltype(::Type{<:AbstractKmerIterator{A, K}}) where {A, K} + Kmer{A, K, n_coding_elements(Kmer{A, K})} +end + +""" + source_type(::Type{<:AbstractKmerIterator})::Type + +Get the type of the data source that kmers are extracted from +""" +function source_type end + +function used_source(::AsciiEncode, s::AbstractString) + is_ascii(typeof(s)) ? codeunits(s) : s +end +used_source(::RecodingScheme, s) = s + +@noinline throw_bad_byte_error(b::UInt8) = + error("Cannot interpret byte $(repr(b)) as nucleotide") + +const ASCII_SKIPPING_LUT = let + v = fill(0xff, 256) + for (i, s) in [(0, "Aa"), (1, "cC"), (2, "gG"), (3, "TtUu")], c in s + v[UInt8(c) + 1] = i + end + for c in "-MRSVWYHKDBN" + v[UInt8(c) + 1] = 0xf0 + v[UInt8(lowercase(c)) + 1] = 0xf0 + end + Tuple(v) +end diff --git a/src/kmer.jl b/src/kmer.jl index b71a57a..9421279 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -1,590 +1,480 @@ -### -### Kmer Type definition -### - -# Include some basic tuple bitflipping ops - the secret sauce to efficiently -# manipping Kmer's static data. -include("tuple_bitflipping.jl") - -""" - Kmers.Unsafe - -Trait object used to access unsafe methods of functions. -`unsafe` is the singleton of `Unsafe`. -""" -struct Unsafe end -const unsafe = Unsafe() - """ Kmer{A<:Alphabet,K,N} <: BioSequence{A} -A parametric, immutable, bitstype for representing Kmers - short sequences. -Given the number of Kmers generated from raw sequencing reads, avoiding -repetetive memory allocation and triggering of garbage collection is important, -as is the ability to effectively pack Kmers into arrays and similar collections. - -In practice that means we an immutable bitstype as the internal representation -of these sequences. Thankfully, this is not much of a limitation - kmers are -rarely manipulated and so by and large don't have to be mutable. - -Excepting their immutability, they fulfill the rest of the API and behaviours -expected from a concrete `BioSequence` type, and non-mutating transformations -of the type are still defined. - -!!! warning - Given their immutability, `setindex` and mutating sequence transformations - are not implemented for Kmers e.g. `reverse_complement!`. -!!! tip - Note that some sequence transformations that are not mutating are - available, since they can return a new kmer value as a result e.g. - `reverse_complement`. -""" -struct Kmer{A<:Alphabet,K,N} <: BioSequence{A} - data::NTuple{N,UInt64} - - # This unsafe method do not clip the head - Kmer{A,K,N}(::Unsafe, data::NTuple{N,UInt64}) where {A<:Alphabet,K,N} = new{A,K,N}(data) +An immutable bitstype for representing k-mers - short `BioSequences` +of a fixed length `K`. +Since they can be stored directly in registers, `Kmer`s are generally the most +efficient type of `BioSequence`, when `K` is small and known at compile time. - function Kmer{A,K,N}(data::NTuple{N,UInt64}) where {A<:Alphabet,K,N} - checkmer(Kmer{A,K,N}) - x = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) - return new(_cliphead(x, data...)) - end -end +The `N` parameter is derived from `A` and `K` and is not a free parameter. -BioSequences.encoded_data(seq::Kmer{A,K,N}) where {A,K,N} = seq.data +See also: [`DNAKmer`](@ref), [`RNAKmer`](@ref), [`AAKmer`](@ref), [`AbstractKmerIterator`](@ref) -# Create a blank ntuple of appropriate length for a given Kmer with N. -@inline blank_ntuple(::Type{Kmer{A,K,N}}) where {A,K,N} = ntuple(x -> zero(UInt64), Val{N}()) +# Examples +```jldoctest +julia> RNAKmer{5}("ACGUC") +RNA 5-mer: +ACGUC -### -### _build_kmer_data -### +julia> Kmer{DNAAlphabet{4}, 6}(dna"TGCTTA") +DNA 6-mer: +TGCTTA -#= -These are (hopefully!) very optimised kernel functions for building kmer internal -data from individual elements or from sequences. Kmers themselves are static, -tuple-based structs, and so I really didn't want these functions to create memory -allocations or GC activity through use of vectors an such, for what should be -the creation of a single, rather simple value. -=# +julia> AAKmer{5}((lowercase(i) for i in "KLWYR")) +AminoAcid 5-mer: +KLWYR +julia> RNAKmer{3}("UAUC") # wrong length +ERROR: +[...] +``` """ - _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - -Construct a ntuple of the bits data for an instance of a Kmer{A,K,N}. +struct Kmer{A <: Alphabet, K, N} <: BioSequence{A} + # The number of UInt is always exactly the number needed, no less, no more. + # The first symbols pack into the first UInts + # An UInt with N elements pack into the lowest bits of the UInt, with the + # first symbols in the higher parts of the UInt. + # Hence, a sequence A-G of 16-bit elements would pack like: + # ( ABC, DEFG) + # ^ 16 unused bits, the unused bits are always top bits of first UInt + # Unused bits are always zero + + # This layout complicates some Kmer construction code, but simplifies comparison + # operators, and we really want Kmers to be efficient. + data::NTuple{N, UInt} -This particular method is specialised for LongSequences, and for when the Kmer -and LongSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. -""" -@inline function _build_kmer_data(::Type{Kmer{A,K,N}}, seq::LongSequence{A}, from::Int = 1) where {A,K,N} - checkmer(Kmer{A,K,N}) - - bits_per_sym = BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - n_head = elements_in_head(Kmer{A,K,N}) # Based on kmer type, should constant fold. - n_per_chunk = per_word_capacity(Kmer{A,K,N}) # Based on kmer type, should constant fold. - - if from + K - 1 > length(seq) - return nothing - end - - # Construct the head. - head = zero(UInt64) - @inbounds for i in from:(from + n_head - 1) - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end - - # And the rest of the sequence - idx = Ref(from + n_head) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body + # This unsafe method do not clip the head + function Kmer{A, K, N}(::Unsafe, data::NTuple{N, UInt}) where {A <: Alphabet, K, N} + check_kmer(Kmer{A, K, N}) + new{A, K, N}(data) end - - # Put head and tail together - return (head, tail...) end +# Useful to do e.g. `mer"TAG"d isa Mer{3}` +""" + Mer{K} +Alias for `Kmer{<:Alphabet, K}`. Useful to dispatch on `K-mers` without regard +for the alphabat -### -### Constructors -### +# Example +```jldoctest +julia> mer"DEKR"a isa Mer{4} +true +julia> DNAKmer{6}("TGATCA") isa Mer{6} +true + +julia> RNACodon <: Mer{3} +true +``` """ - Kmer{A,K,N}(itr) where {A,K,N} +const Mer{K} = Kmer{<:Alphabet, K} -Construct a `Kmer{A,K,N}` from an iterable. +# Aliases +"Alias for `Kmer{DNAAlphabet{2},K,N}`" +const DNAKmer{K, N} = Kmer{DNAAlphabet{2}, K, N} -The most generic constructor. +"Alias for `Kmer{RNAAlphabet{2},K,N}`" +const RNAKmer{K, N} = Kmer{RNAAlphabet{2}, K, N} -Currently the iterable must have `length` & support `getindex` with integers. +"Alias for `Kmer{AminoAcidAlphabet,K,N}`" +const AAKmer{K, N} = Kmer{AminoAcidAlphabet, K, N} -# Examples +"Alias for `DNAKmer{3,1}`" +const DNACodon = DNAKmer{3, 1} -```jldoctest -julia> ntseq = LongSequence("TTAGC") # 4-bit DNA alphabet -5nt DNA Sequence: -TTAGC +"Alias for `RNAKmer{3,1}`" +const RNACodon = RNAKmer{3, 1} -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` """ -function Kmer{A,K,N}(itr) where {A,K,N} - checkmer(Kmer{A,K,N}) - - seqlen = length(itr) - if seqlen != K - throw(ArgumentError("itr does not contain enough elements ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - (x, next_i) = iterate(itr, i) - sym = convert(eltype(Kmer{A,K,N}), x) - # Encode will throw if it cant encode an element. - head = (head << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) + check_kmer(::Type{Kmer{A,K,N}}) where {A,K,N} + +Internal methods that checks that the type parameters are good. + +This function should compile to a noop in case the parameterization is good. +""" +@inline function check_kmer(::Type{Kmer{A, K, N}}) where {A, K, N} + if !(K isa Int) + throw(ArgumentError("K must be an Int")) + elseif K < 0 + throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) end - - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for i in 1:n_per_chunk - (x, next_idx) = iterate(itr, idx[]) - sym = convert(eltype(Kmer{A,K,N}), x) - # Encode will throw if it cant encode an element. - body = (body << bits_per_sym) | UInt64(BioSequences.encode(A(), sym)) - idx[] += 1 - end - return body + n = cld((K * BioSequences.bits_per_symbol(A())) % UInt, (sizeof(UInt) * 8) % UInt) % Int + if !(N isa Int) + throw(ArgumentError("N must be an Int")) + elseif n !== N + # This has been significantly changed conceptually from before. Now we + # don't just check K, but *enforce* the most appropriate N for K. + throw(ArgumentError("Bad kmer parameterisation. For K = $K, N should be $n")) end - - data = (head, tail...) - - return Kmer{A,K,N}(data) end -""" - Kmer{A,K,N}(seq::BioSequence{A}) - -Construct a `Kmer{A,K,N}` from a `BioSequence{A}`. +################################################ +# Compile-time functions computed on Kmer types +################################################ -This particular method is specialised for BioSequences, and for when the Kmer -and BioSequence types used, share the same alphabet, since a lot of encoding / -decoding can be skipped, and the problem is mostly one of shunting bits around. -In the case where the alphabet of the Kmer and the alphabet of the BioSequence -differ, dispatch to the more generic constructor occurs instead. +@inline ksize(::Type{<:Kmer{A, K}}) where {A, K} = K +@inline nsize(::Type{<:Kmer{A, K, N}}) where {A, K, N} = N +@inline n_unused(::Type{<:Kmer{A, K, N}}) where {A, K, N} = capacity(Kmer{A, K, N}) - K +@inline bits_unused(T::Type{<:Kmer}) = + n_unused(T) * BioSequences.bits_per_symbol(Alphabet(T)) -# Examples +@inline function n_coding_elements(::Type{<:Kmer{A, K}}) where {A, K} + cld(BioSequences.bits_per_symbol(A()) * K, 8 * sizeof(UInt)) +end -```jldoctest -julia> ntseq = LongSequence{DNAAlphabet{2}}("TTAGC") # 2-bit DNA alphabet -5nt DNA Sequence: -TTAGC +@inline function per_word_capacity(::Type{<:Kmer{A}}) where {A} + div(8 * sizeof(UInt), BioSequences.bits_per_symbol(A())) +end -julia> DNAKmer{5}(ntseq) # 2-Bit DNA alphabet -DNA 5-mer: -TTAGC -``` -""" -@inline function Kmer{A,K,N}(seq::BioSequence{A}) where {A,K,N} - checkmer(Kmer{A,K,N}) - - seqlen = length(seq) - if seqlen != K - throw(ArgumentError("seq is not the correct length ($seqlen ≠ $K)")) - end - - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - bits = UInt64(BioSequences.extract_encoded_element(seq, i)) - head = (head << bits_per_sym) | bits - end - - # And the rest of the sequence - idx = Ref(n_head + 1) - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.extract_encoded_element(seq, idx[])) - body = (body << bits_per_sym) | bits - idx[] += 1 - end - return body - end - - data = (head, tail...) - - return Kmer{A,K,N}(data) +@inline function capacity(::Type{<:Kmer{A, K, N}}) where {A, K, N} + per_word_capacity(Kmer{A, K, N}) * N end +@inline function elements_in_head(::Type{<:Kmer{A, K, N}}) where {A, K, N} + per_word_capacity(Kmer{A, K, N}) - n_unused(Kmer{A, K, N}) +end -# Convenience version of function above so you don't have to work out correct N. -""" - Kmer{A,K}(itr) where {A,K} +@inline derive_type(::Type{Kmer{A, K}}) where {A, K} = + Kmer{A, K, n_coding_elements(Kmer{A, K})} -Construct a `Kmer{A,K,N}` from an iterable. +@inline zero_tuple(T::Type{<:Kmer}) = ntuple(i -> zero(UInt), Val{nsize(T)}()) -This is a convenience method which will work out the correct `N` parameter, for -your given choice of `A` & `K`. -""" -@inline function Kmer{A,K}(itr) where {A,K} - T = kmertype(Kmer{A,K}) - return T(itr) +@inline function zero_kmer(::Type{<:Kmer{A, K}}) where {A, K} + T2 = derive_type(Kmer{A, K}) + T2(unsafe, zero_tuple(T2)) end -""" - Kmer{A}(itr) where {A} +################## +# Various methods +################## -Construct a `Kmer{A,K,N}` from an iterable. +# BioSequences interface +Base.length(x::Kmer) = ksize(typeof(x)) +Base.copy(x::Kmer) = x # immutable +BioSequences.encoded_data_eltype(::Type{<:Kmer}) = UInt -This is a convenience method which will work out K from the length of `itr`, and -the correct `N` parameter, for your given choice of `A` & `K`. +# BioSequences helper methods +BioSequences.encoded_data(seq::Kmer) = seq.data -!!! warning - Since this gets K from runtime values, this is gonna be slow! -""" -@inline Kmer{A}(itr) where {A} = Kmer{A,length(itr)}(itr) -@inline Kmer(seq::BioSequence{A}) where A = Kmer{A}(seq) +# Misc methods +Base.summary(x::Kmer{A, K, N}) where {A, K, N} = string(eltype(x), ' ', K, "-mer") -function Kmer{A1}(seq::BioSequence{A2}) where {A1 <: NucleicAcidAlphabet, A2 <: NucleicAcidAlphabet} - kmertype(Kmer{A1, length(seq)})(seq) +function Base.show(io::IO, ::MIME"text/plain", s::Kmer) + println(io, summary(s), ':') + print(io, s) end -@inline function Kmer{A}(nts::Vararg{Union{DNA, RNA}, K}) where {A <: NucleicAcidAlphabet, K} - return kmertype(Kmer{A, K})(nts) +# TODO: This is only efficient because the compiler, through Herculean effort, +# is able to completely unroll and inline the indexing operation. +@inline function _cmp(x::Kmer{A1, K1}, y::Kmer{A2, K2}) where {A1, A2, K1, K2} + if K1 == K2 + cmp(x.data, y.data) + else + m = min(K1, K2) + a = @inline x[1:m] + b = @inline y[1:m] + c = cmp(a.data, b.data) + if iszero(c) + K1 < K2 ? -1 : K2 < K1 ? 1 : 0 + else + c + end + end end +# Here, we don't allow comparing twobit to fourbit sequences. We could do this semantically, +# but this would open a whole can of worms, be impossible to optimise and defeat the purpose +# of using Kmers. +Base.cmp(x::Kmer{A}, y::Kmer{A}) where {A} = _cmp(x, y) +Base.cmp(x::Kmer{<:FourBit}, y::Kmer{<:FourBit}) = _cmp(x, y) +Base.cmp(x::Kmer{<:TwoBit}, y::Kmer{<:TwoBit}) = _cmp(x, y) +Base.cmp(x::Kmer{A}, y::Kmer{B}) where {A, B} = throw(MethodError(cmp, (x, y))) + +Base.isless(x::Kmer, y::Kmer) = @inline(cmp(x, y)) == -1 +Base.:(==)(x::Kmer, y::Kmer) = iszero(@inline cmp(x, y)) + +Base.:(==)(x::Kmer, y::BioSequence) = throw(MethodError(==, (x, y))) +Base.:(==)(x::BioSequence, y::Kmer) = throw(MethodError(==, (x, y))) + +Base.hash(x::Kmer, h::UInt) = hash(x.data, h ⊻ ksize(typeof(x))) + +# These constants are from the original implementation +@static if Sys.WORD_SIZE == 32 + # typemax(UInt32) / golden ratio + const FX_CONSTANT = 0x9e3779b9 +elseif Sys.WORD_SIZE == 64 + # typemax(UInt64) / pi + const FX_CONSTANT = 0x517cc1b727220a95 +else + error("Invalid word size") +end + +# This implementation is translated from the Rust compiler source code, +# licenced under MIT. The original source is the Firefox source code, +# also freely licensed. """ - Kmer(nts::Vararg{DNA,K}) where {K} + fx_hash(x, [h::UInt])::UInt -Construct a Kmer from a variable number `K` of DNA nucleotides. +An implementation of `FxHash`. This hash function is extremely fast, but the hashes +are of poor quality compared to Julia's default MurmurHash3. In particular: +* The value of any particular bit in the output depends only on bits in the same, + and lower positions +* The bitpattern zero hashes to zero -# Examples +However, for many applications, `FxHash` is good enough, where the higher rate of +hash collisions are offset by the faster speed. -```jldoctest -julia> Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C) -DNA 5-mer: -TTAGC -``` -""" -@inline Kmer(nt::DNA, nts::Vararg{DNA}) = DNAKmer((nt, nts...)) +The precise hash value of a given kmer is not guaranteed to be stable across minor +releases of Kmers.jl, but _is_ guaranteed to be stable across minor versions of +Julia. -""" - Kmer(nts::Vararg{RNA,K}) where {K} +# Examples +```jldoctest +julia> x = fx_hash(mer"KWQLDE"a); -Construct a Kmer from a variable number `K` of RNA nucleotides. +julia> y = fx_hash(mer"KWQLDE"a, UInt(1)); -# Examples +julia> x isa UInt +true -```jldoctest -julia> Kmer(RNA_U, RNA_U, RNA_A, RNA_G, RNA_C) -DNA 5-mer: -UUAGC +julia> x == y +false ``` """ -@inline Kmer(nt::RNA, nts::Vararg{RNA}) = RNAKmer((nt, nts...)) - +function fx_hash(x::Kmer, h::UInt) + for i in x.data + h = (bitrotate(h, 5) ⊻ i) * FX_CONSTANT + end + h +end +fx_hash(x) = fx_hash(x, zero(UInt)) """ - Kmer(seq::String) + push(kmer::Kmer{A, K}, s)::Kmer{A, K+1} -Construct a DNA or RNA kmer from a string. +Create a new kmer which is the concatenation of `kmer` and `s`. +Returns a `K+1`-mer. -!!! warning - As a convenience method, this derives the `K`, `Alphabet`, and `N` parameters - for the `Kmer{A,K,N}` type from the input string. +!!! warn + Since the output of this function is a `K+1`-mer, use of this function + in a loop may result in type-instability. -# Examples +See also: [`push_first`](@ref), [`pop`](@ref), [`shift`](@ref) +# Examples ```jldoctest -julia> Kmer("TTAGC") -DNA 5-mer: -TTAGC +julia> push(mer"UGCUGA"r, RNA_G) +RNA 7-mer: +UGCUGAG + +julia> push(mer"W"a, 'E') +AminoAcid 2-mer: +WE ``` """ -@inline function Kmer(seq::String) - seq′ = BioSequences.remove_newlines(seq) - hast = false - hasu = false - for c in seq′ - hast |= ((c == 'T') | (c == 't')) - hasu |= ((c == 'U') | (c == 'u')) - end - if (hast & hasu) | (!hast & !hasu) - throw(ArgumentError("Can't detect alphabet type from string")) +function push(kmer::Kmer, s) + bps = BioSequences.bits_per_symbol(kmer) + A = Alphabet(kmer) + newT = derive_type(Kmer{typeof(A), length(kmer) + 1}) + # If no free space in data, add new tuple + new_data = if bits_unused(typeof(kmer)) < bps + (zero(UInt), kmer.data...) + else + kmer.data end - A = ifelse(hast & !hasu, DNAAlphabet{2}, RNAAlphabet{2}) - return Kmer{A,length(seq′)}(seq′) + # leftshift_carry the new encoding in. + encoding = UInt(BioSequences.encode(A, convert(eltype(kmer), s))) + (_, new_data) = leftshift_carry(new_data, bps, encoding) + newT(unsafe, new_data) end - -""" - kmertype(::Type{Kmer{A,K}}) where {A,K} -Resolve and incomplete kmer typing, computing the N parameter of -`Kmer{A,K,N}`, given only `Kmer{A,K}`. -## Example -```julia -julia> DNAKmer{63} -Kmer{DNAAlphabet{2},63,N} where N -julia> kmertype(DNAKmer{63}) -Kmer{DNAAlphabet{2},63,2} -``` """ -@inline function kmertype(::Type{Kmer{A,K}}) where {A,K} - return Kmer{A,K,BioSequences.seq_data_len(A, K)} -end -@inline kmertype(::Type{Kmer{A,K,N}}) where {A,K,N} = Kmer{A,K,N} - -# Aliases -"Shortcut for the type `Kmer{DNAAlphabet{2},K,N}`" -const DNAKmer{K,N} = Kmer{DNAAlphabet{2},K,N} - -"Shortcut for the type `DNAKmer{27,1}`" -const DNA27mer = DNAKmer{27,1} - -"Shortcut for the type `DNAKmer{31,1}`" -const DNA31mer = DNAKmer{31,1} - -"Shortcut for the type `DNAKmer{63,2}`" -const DNA63mer = DNAKmer{63,2} + shift(kmer::Kmer{A, K}, s)::Kmer{A, K} -"Shortcut for the type `Kmer{RNAAlphabet{2},K,N}`" -const RNAKmer{K,N} = Kmer{RNAAlphabet{2},K,N} +Push `symbol` onto the end of `kmer`, and pop the first symbol in `kmer`. +Unlike `push`, this preserves the input type, and is less likely to result in +type instability. -"Shortcut for the type `RNAKmer{27,1}`" -const RNA27mer = RNAKmer{27,1} +See also: [`shift_first`](@ref), [`push`](@ref) -"Shortcut for the type `RNAKmer{31,1}`" -const RNA31mer = RNAKmer{31,1} - -"Shortcut for the type `RNAKmer{63,2}`" -const RNA63mer = RNAKmer{63,2} - -"Shortcut for the type `Kmer{AminoAcidAlphabet,K,N}`" -const AAKmer{K,N} = Kmer{AminoAcidAlphabet,K,N} - -"Shorthand for `DNAKmer{3,1}`" -const DNACodon = DNAKmer{3,1} +# Examples +```jldoctest +julia> shift(mer"TACC"d, DNA_A) +DNA 4-mer: +ACCA -"Shorthand for `RNAKmer{3,1}`" -const RNACodon = RNAKmer{3,1} +julia> shift(mer"WKYMLPIIRS"aa, 'F') +AminoAcid 10-mer: +KYMLPIIRSF +``` +""" +function shift(kmer::Kmer{A}, s) where {A} + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + shift_encoding(kmer, encoding) +end +@inline function shift_encoding(kmer::Kmer, encoding::UInt) + isempty(kmer) && return kmer + bps = BioSequences.bits_per_symbol(kmer) + (_, new_data) = leftshift_carry(kmer.data, bps, encoding) + typeof(kmer)(unsafe, (first(new_data) & get_mask(typeof(kmer)), Base.tail(new_data)...)) +end -### -### Base Functions -### +""" + push_first(kmer::Kmer{A, K}, s)::Kmer{A, K+1} -@inline ksize(::Type{Kmer{A,K,N}}) where {A,K,N} = K -@inline nsize(::Type{Kmer{A,K,N}}) where {A,K,N} = N -@inline per_word_capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = div(64, BioSequences.bits_per_symbol(A())) -@inline per_word_capacity(seq::Kmer) = per_word_capacity(typeof(seq)) -@inline capacity(::Type{Kmer{A,K,N}}) where {A,K,N} = per_word_capacity(Kmer{A,K,N}) * N -@inline capacity(seq::Kmer) = capacity(typeof(seq)) -@inline n_unused(::Type{Kmer{A,K,N}}) where {A,K,N} = capacity(Kmer{A,K,N}) - K -@inline n_unused(seq::Kmer) = n_unused(typeof(seq)) -@inline elements_in_head(::Type{Kmer{A,K,N}}) where {A,K,N} = per_word_capacity(Kmer{A,K,N}) - n_unused(Kmer{A,K,N}) -@inline elements_in_head(seq::Kmer) = elements_in_head(typeof(seq)) +Create a new kmer which is the concatenation of `s` and `kmer`. +Returns a `K+1`-mer. Similar to [`push`](@ref), but places the new symbol `s` +at the front. -""" - checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} +!!! warn + Since the output of this function is a `K+1`-mer, use of this function + in a loop may result in type-instability. -Internal method - enforces good kmer type parameterisation. +See also: [`push`](@ref), [`pop`](@ref), [`shift`](@ref) -For a given Kmer{A,K,N} of length K, the number of words used to -represent it (N) should be the minimum needed to contain all K symbols, -no larger (wasteful) no smaller (just... wrong). +# Examples +```jldoctest +julia> push_first(mer"GCU"r, RNA_G) +RNA 4-mer: +GGCU -Because it is used on type parameters / variables, these conditions should be -checked at compile time, and the branches / error throws eliminated when the -parameterisation of the Kmer type is good. +julia> push_first(mer"W"a, 'E') +AminoAcid 2-mer: +EW +``` """ -@inline function checkmer(::Type{Kmer{A,K,N}}) where {A,K,N} - if K < 1 - throw(ArgumentError("Bad kmer parameterisation. K must be greater than 0.")) - end - n = BioSequences.seq_data_len(A, K) - if n !== N - # This has been significantly changed conceptually from before. Now we - # don't just check K, but *enforce* the most appropriate N for K. - throw(ArgumentError("Bad kmer parameterisation. For K = $K, N should be $n")) +function push_first(kmer::Kmer{A}, s) where {A} + bps = BioSequences.bits_per_symbol(A()) + newT = derive_type(Kmer{A, length(kmer) + 1}) + # If no free space in data, add new tuple + new_data = if bits_unused(typeof(kmer)) < bps + (zero(UInt), kmer.data...) + else + kmer.data end + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + head = first(new_data) | left_shift(encoding, (elements_in_head(newT) - 1) * bps) + newT(unsafe, (head, tail(new_data)...)) end -@inline Base.length(x::Kmer{A,K,N}) where {A,K,N} = K -@inline Base.summary(x::Kmer{A,K,N}) where {A,K,N} = string(eltype(x), ' ', K, "-mer") +""" + shift_first(kmer::kmer, symbol)::typeof(kmer) -function Base.typemin(::Type{Kmer{A,K,N}}) where {A,K,N} - return Kmer{A,K,N}(unsafe, ntuple(i -> zero(UInt64), N)) -end +Push `symbol` onto the start of `kmer`, and pop the last symbol in `kmer`. -function Base.typemax(::Type{Kmer{A,K,N}}) where {A,K,N} - return Kmer{A,K,N}((typemax(UInt64), ntuple(i -> typemax(UInt64), N - 1)...)) -end +See also: [`shift`](@ref), [`push`](@ref) -@inline function rand_kmer_data(::Type{Kmer{A,K,N}}, ::Val{true}) where {A,K,N} - return Kmer{A,K,N}(ntuple(i -> rand(UInt64), Val{N}())) -end +# Examples +```jldoctest +julia> shift_first(mer"TACC"d, DNA_A) +DNA 4-mer: +ATAC -@inline function rand_kmer_data(::Type{Kmer{A,K,N}}, ::Val{false}) where {A,K,N} - ## All based on alphabet type of Kmer, so should constant fold. - bits_per_sym = BioSequences.bits_per_symbol(A()) - n_head = elements_in_head(Kmer{A,K,N}) - n_per_chunk = per_word_capacity(Kmer{A,K,N}) - # Construct the head. - head = zero(UInt64) - @inbounds for i in 1:n_head - bits = UInt64(BioSequences.encode(A(), rand(symbols(A())))) - head = (head << bits_per_sym) | bits - end - # And the rest of the sequence - tail = ntuple(Val{N - 1}()) do i - Base.@_inline_meta - body = zero(UInt64) - @inbounds for _ in 1:n_per_chunk - bits = UInt64(BioSequences.encode(A(), rand(symbols(A())))) - body = (body << bits_per_sym) | bits - end - return body - end - return (head, tail...) +julia> shift_first(mer"WKYMLPIIRS"aa, 'F') +AminoAcid 10-mer: +FWKYMLPIIR +``` +""" +function shift_first(kmer::Kmer{A}, s) where {A} + encoding = UInt(BioSequences.encode(A(), convert(eltype(kmer), s))) + shift_first_encoding(kmer, encoding) end +function shift_first_encoding(kmer::Kmer{A}, encoding::UInt) where {A} + isempty(kmer) && return kmer + bps = BioSequences.bits_per_symbol(A()) + (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) + head = + first(new_data) | left_shift(encoding, (elements_in_head(typeof(kmer)) - 1) * bps) + typeof(kmer)(unsafe, (head, tail(new_data)...)) +end """ - Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} - Base.rand(::Type{Kmer{A,K}}) where {A,K} + pop(kmer::Kmer{A, K})::Kmer{A, K-1} -Create a random kmer of a specified alphabet and length +Returns a new kmer with the last symbol of the input `kmer` removed. +Throws an `ArgumentError` if `kmer` is empty. + +!!! warn + Since the output of this function is a `K-1`-mer, use of this function + in a loop may result in type-instability. + +See also: [`pop_first`](@ref), [`push`](@ref), [`shift`](@ref) # Examples -```julia -julia> rand(Kmer{DNAAlphabet{2}, 3}) -BioSymbols.DNA 3-mer: -ACT +```jldoctest +julia> pop(mer"TCTGTA"d) +DNA 5-mer: +TCTGT +julia> pop(mer"QPSY"a) +AminoAcid 3-mer: +QPS + +julia> pop(mer""a) +ERROR: ArgumentError: +[...] ``` """ -@inline function Base.rand(::Type{Kmer{A,K,N}}) where {A,K,N} - checkmer(Kmer{A,K,N}) - return Kmer{A,K,N}(rand_kmer_data(Kmer{A,K,N}, BioSequences.iscomplete(A()))) +function pop(kmer::Kmer{A}) where {A} + isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) + bps = BioSequences.bits_per_symbol(A()) + newT = derive_type(Kmer{A, length(kmer) - 1}) + (_, new_data) = rightshift_carry(kmer.data, bps, zero(UInt)) + new_data = if elements_in_head(typeof(kmer)) == 1 + tail(new_data) + else + new_data + end + newT(unsafe, new_data) end -Base.rand(::Type{Kmer{A,K}}) where {A,K} = rand(kmertype(Kmer{A,K})) +""" + pop_first(kmer::Kmer{A, K})::Kmer{A, K-1} -function Base.rand(::Type{T}, size::Integer) where {T<:Kmer} - return [rand(T) for _ in 1:size] -end +Returns a new kmer with the first symbol of the input `kmer` removed. +Throws an `ArgumentError` if `kmer` is empty. -### -### Old Mer Base Functions - not transferred to new type. -### -#@inline encoded_data_type(::Type{Mer{A,K}}) where {A,K} = UInt64 -#@inline encoded_data_type(::Type{BigMer{A,K}}) where {A,K} = UInt128 -#@inline encoded_data_type(x::AbstractMer) = encoded_data_type(typeof(x)) -#@inline encoded_data(x::AbstractMer) = reinterpret(encoded_data_type(typeof(x)), x) -#@inline ksize(::Type{T}) where {A,K,T<:AbstractMer{A,K}} = K -#@inline Base.unsigned(x::AbstractMer) = encoded_data(x) -#Base.:-(x::AbstractMer, y::Integer) = typeof(x)(encoded_data(x) - y % encoded_data_type(x)) -#Base.:+(x::AbstractMer, y::Integer) = typeof(x)(encoded_data(x) + y % encoded_data_type(x)) -#Base.:+(x::AbstractMer, y::AbstractMer) = y + x -#Alphabet(::Type{Mer{A,K} where A<:NucleicAcidAlphabet{2}}) where {K} = Any - -include("indexing.jl") - -#LongSequence{A}(x::Kmer{A,K,N}) where {A,K,N} = LongSequence{A}([nt for nt in x]) -# Convenience method so as don't need to specify A in LongSequence{A}. -BioSequences.LongSequence(x::Kmer{A,K,N}) where {A,K,N} = LongSequence{A}(x) - -include("predicates.jl") -include("counting.jl") -include("transformations.jl") - -### -### Kmer de-bruijn neighbors -### - -# TODO: Decide on this vs. old iterator pattern. I like the terseness of the code vs defining an iterator. Neither should allocate. -fw_neighbors(kmer::Kmer{A,K,N}) where {A<:DNAAlphabet,K,N} = ntuple(i -> pushlast(kmer, ACGT[i]), Val{4}()) -fw_neighbors(kmer::Kmer{A,K,N}) where {A<:RNAAlphabet,K,N} = ntuple(i -> pushlast(kmer, ACGU[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A,K,N}) where {A<:DNAAlphabet,K,N} = ntuple(i -> pushfirst(kmer, ACGT[i]), Val{4}()) -bw_neighbors(kmer::Kmer{A,K,N}) where {A<:RNAAlphabet,K,N} = ntuple(i -> pushfirst(kmer, ACGU[i]), Val{4}()) - -#= -# Neighbors on a de Bruijn graph -struct KmerNeighborIterator{S<:Kmer} - x::S -end +!!! warn + Since the output of this function is a `K-1`-mer, use of this function + in a loop may result in type-instability. -""" - neighbors(kmer::S) where {S<:Kmer} +See also: [`pop`](@ref), [`push`](@ref), [`shift`](@ref) -Return an iterator through skip-mers neighboring `skipmer` on a de Bruijn graph. -""" -neighbors(kmer::Kmer) = KmerNeighborIterator{typeof(kmer)}(kmer) +# Examples +```jldoctest +julia> pop_first(mer"TCTGTA"d) +DNA 5-mer: +CTGTA -Base.length(::KmerNeighborIterator) = 4 -Base.eltype(::Type{KmerNeighborIterator{S}}) where {S<:Kmer} = S +julia> pop_first(mer"QPSY"a) +AminoAcid 3-mer: +PSY -function Base.iterate(it::KmerNeighborIterator{S}, i::UInt64 = 0) where {S<:Kmer} - if i == 4 - return nothing - else - #return S((encoded_data(it.x) << 2) | i), i + 1 - return it.x << 1, i + one(UInt64) - end -end -=# - -### -### String literals -### - -macro mer_str(seq, flag) - seq′ = BioSequences.remove_newlines(seq) - if flag == "dna" || flag == "d" - T = kmertype(DNAKmer{length(seq′)}) - return T(seq′) - elseif flag == "rna" || flag == "r" - T = kmertype(RNAKmer{length(seq′)}) - return T(seq′) - elseif flag == "aa" || flag == "a" || flag == "prot" || flag == "p" - T = kmertype(AAKmer{length(seq′)}) - return T(seq′) +julia> pop_first(mer""a) +ERROR: ArgumentError: +[...] +``` +""" +function pop_first(kmer::Kmer{A}) where {A} + isempty(kmer) && throw(ArgumentError("Cannot pop 0-mer")) + data = if elements_in_head(typeof(kmer)) == 1 + tail(kmer.data) else - error("Invalid type flag: '$(flag)'") + bps = BioSequences.bits_per_symbol(A()) + bits_used = 8 * sizeof(UInt) - (bits_unused(typeof(kmer)) + bps) + mask = left_shift(UInt(1), bits_used) - UInt(1) + (first(kmer.data) & mask, tail(kmer.data)...) end + newT = derive_type(Kmer{A, length(kmer) - 1}) + newT(unsafe, data) end -macro mer_str(seq) - seq′ = BioSequences.remove_newlines(seq) - T = kmertype(DNAKmer{length(seq′)}) - return T(seq′) +# Get a mask 0x0001111 ... masking away the unused bits of the head element +# in the UInt tuple +@inline function get_mask(T::Type{<:Kmer}) + UInt(1) << (8 * sizeof(UInt) - bits_unused(T)) - 1 end - -include("revtrans.jl") \ No newline at end of file diff --git a/src/kmer_iteration/AbstractKmerIterator.jl b/src/kmer_iteration/AbstractKmerIterator.jl deleted file mode 100644 index f320476..0000000 --- a/src/kmer_iteration/AbstractKmerIterator.jl +++ /dev/null @@ -1,36 +0,0 @@ -### -### Kmer Iteration -### -### Abstract Kmer Iterator type. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -### Type for storing the result of Kmer iteration. - -abstract type AbstractKmerIterator{T<:Kmer,S<:BioSequence} end - -@inline Base.eltype(::Type{<:AbstractKmerIterator{T,S}}) where {T,S} = Tuple{UInt64,T} - -@inline Base.IteratorSize(::Type{<:AbstractKmerIterator{Kmer{A,K,N},S}}) where {A,S<:BioSequence{A},K,N} = Base.HasLength() -@inline Base.IteratorSize(::Type{<:AbstractKmerIterator{Kmer{A,K,N},S}}) where {A,B,S<:BioSequence{B},K,N} = Base.SizeUnknown() - -@inline function Base.length(it::AbstractKmerIterator{Kmer{A,K,N},S}) where {A,K,N,S<:BioSequence{A}} - return max(0, fld(it.stop - it.start + 1 - K, step(it)) + 1) -end - -# Iteration where the Kmer and Seq alphabets match: - -## Initial iteration without state. -@inline function Base.iterate(it::AbstractKmerIterator{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) - if isnothing(fwkmer) - return nothing - else - # Get the reverse. - alph = Alphabet(Kmer{A,K,N}) - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(alph) # Based on alphabet type, should constant fold. - rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(alph), _complement_bitpar(alph, fwkmer...)...), rshift) - return KmerAt{Kmer{A,K,N}}(1, Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer)), (K, fwkmer, rvkmer) - end -end \ No newline at end of file diff --git a/src/kmer_iteration/EveryCanonicalKmer.jl b/src/kmer_iteration/EveryCanonicalKmer.jl deleted file mode 100644 index 21e883b..0000000 --- a/src/kmer_iteration/EveryCanonicalKmer.jl +++ /dev/null @@ -1,122 +0,0 @@ -""" - EveryCanonicalKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -An iterator over every canonical valid overlapping `T<:Kmer` in a given longer -`BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by K - 1 bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not reliably be K - 1, - and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct EveryCanonicalKmer{T<:Kmer,S<:BioSequence{<:NucleicAcidAlphabet}} <: AbstractKmerIterator{T,S} - seq::S - start::Int - stop::Int - - function EveryCanonicalKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. - return new{T′,S}(seq, start, stop) - end -end - -""" - EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `EveryCanonicalKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryCanonicalKmer{DNACodon}(s)` -""" -function EveryCanonicalKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return EveryCanonicalKmer{T,S}(seq, start, stop) -end - -""" - EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `EveryCanonicalKmer{DNAKmer{3,1}}(s)`, or `EveryCanonicalKmer{DNACodon}(s)`, -you can use `EveryCanonicalKmer(s, Val(3))` -""" -function EveryCanonicalKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return EveryCanonicalKmer{Kmer{A,K}}(seq, start, stop) -end - -Base.step(x::EveryCanonicalKmer) = 1 - - - -## Initial iteration without state. -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, it.start) - if isnothing(fwkmer) - return nothing - else - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(A()), _complement_bitpar(A(), fwkmer...)...), rshift) - return (it.start, Kmer{A,K,N}(min(fwkmer, rvkmer))), (it.start + K - 1, fwkmer, rvkmer) - end -end - -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} - i, fwkmer, rvkmer = state - i += 1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * bps) # Based on type info, should constant fold. - mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. - - fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, bps, fbits) - rvkmer = rightshift_carry(rvkmer, bps, rbits) - pos = i - K + 1 - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, fwkmer, rvkmer) - end -end - -@inline Base.IteratorSize(::Type{<:EveryCanonicalKmer{Kmer{A,N,K},LongSequence{B}}}) where {A<:NucleicAcidAlphabet{2},N,K,B<:NucleicAcidAlphabet{4}} = Base.SizeUnknown() - -@inline function Base.iterate(it::EveryCanonicalKmer{Kmer{A,K,N},LongSequence{B}}, - state = (it.start - 1, 1, blank_ntuple(Kmer{A,K,N}), blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} - - i, filled, fwkmer, rvkmer = state - i += 1 - filled -= 1 - - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * 2) # Based on type info, should constant fold. - mask = (one(UInt64) << 2) - one(UInt64) # Based on type info, should constant fold. - - while i ≤ it.stop - @inbounds nt = reinterpret(UInt8, it.seq[i]) - @inbounds fbits = kmerbits[nt + 1] - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, 2, fbits) - rvkmer = rightshift_carry(rvkmer, 2, rbits) - filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) - if filled == K - return (i - K + 1, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, filled, fwkmer, rvkmer) - end - i += 1 - end - return nothing -end \ No newline at end of file diff --git a/src/kmer_iteration/EveryKmer.jl b/src/kmer_iteration/EveryKmer.jl deleted file mode 100644 index 7885afb..0000000 --- a/src/kmer_iteration/EveryKmer.jl +++ /dev/null @@ -1,124 +0,0 @@ -### -### Kmer Iteration -### -### Iterator type over every kmer in a sequence - overlapping. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -""" - EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid overlapping `T<:Kmer` in a given longer -`BioSequence` between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by K - 1 bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not reliably be K - 1, - and the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct EveryKmer{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} - seq::S - start::Int - stop::Int - - function EveryKmer{T,S}(seq::S, start::Int = firstindex(seq), stop::Int = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. - return new{T′,S}(seq, start, stop) - end -end - -""" - EveryKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `EveryKmer{DNACodon,typeof(s)}(s)`, you can just use `EveryKmer{DNACodon}(s)` -""" -function EveryKmer{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return EveryKmer{T,S}(seq, start, stop) -end - -""" - EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `EveryKmer{DNAKmer{3,1}}(s)`, or `EveryKmer{DNACodon}(s)`, -you can use `EveryKmer(s, Val(3))` -""" -function EveryKmer(seq::BioSequence{A}, ::Val{K}, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return EveryKmer{Kmer{A,K}}(seq, start, stop) -end - -Base.step(x::EveryKmer) = 1 - -## Initial iteration without state. -@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - kmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) - if isnothing(kmer) - return nothing - else - return (1, Kmer{A,K,N}(kmer)), (K, kmer) - end -end - -@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} - i, fwkmer = state - i += 1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - bits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - kmer = leftshift_carry(fwkmer, bps, bits) - pos = i - K + 1 - return (pos, Kmer{A,K,N}(kmer)), (i, kmer) - end -end - -## Special case where iterating over 2-Bit encoded kmers in a 4-Bit encoded sequence, -## behaviour is to produce kmers by skipping over the ambiguous sites. - -const kmerbits = (UInt64(0xff), UInt64(0x00), UInt64(0x01), UInt64(0xff), - UInt64(0x02), UInt64(0xff), UInt64(0xff), UInt64(0xff), - UInt64(0x03), UInt64(0xff), UInt64(0xff), UInt64(0xff), - UInt64(0xff), UInt64(0xff), UInt64(0xff), UInt64(0xff)) - -@inline Base.IteratorSize(::Type{<:EveryKmer{Kmer{A,N,K},S}}) where {A<:NucleicAcidAlphabet{2},N,K,B<:NucleicAcidAlphabet{4},S<:BioSequence{B}} = Base.SizeUnknown() - -@inline function Base.iterate(it::EveryKmer{Kmer{A,K,N},S}, - state = (it.start - 1, 1, blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},S<:BioSequence{B},K,N} - - i, filled, fwkmer = state - i += 1 - filled -= 1 - - while i ≤ it.stop - @inbounds nt = reinterpret(UInt8, it.seq[i]) - @inbounds fbits = kmerbits[nt + 1] - fwkmer = leftshift_carry(fwkmer, 2, fbits) - filled = ifelse(fbits == UInt64(0xff), 0, filled + 1) - if filled == K - return (i - K + 1, Kmer{A,K,N}(fwkmer)), (i, filled, fwkmer) - end - i += 1 - end - return nothing -end \ No newline at end of file diff --git a/src/kmer_iteration/SpacedCanonicalKmers.jl b/src/kmer_iteration/SpacedCanonicalKmers.jl deleted file mode 100644 index c650cf9..0000000 --- a/src/kmer_iteration/SpacedCanonicalKmers.jl +++ /dev/null @@ -1,133 +0,0 @@ - -""" - SpacedCanonicalKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given -longer `BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by `max(0, K - step)` bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not consistent, but the - reading frame will be preserved. - In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct SpacedCanonicalKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} - seq::S - start::Int - step::Int - stop::Int - filled::Int # This is cached for speed - increment::Int # This is cached for speed - - function SpacedCanonicalKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. - if step <= 1 - throw(ArgumentError("step size must be greater than 1")) - end - filled = max(0, ksize(T′) - step) - increment = max(1, step - ksize(T′) + 1) - return new{T′,S}(seq, start, step, stop, filled, increment) - end -end - -""" - SpacedCanonicalKmers{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `SpacedCanonicalKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedCanonicalKmers{DNACodon}(s, 3)` -""" -function SpacedCanonicalKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return SpacedCanonicalKmers{T,S}(seq, step, start, stop) -end - -""" - SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `SpacedCanonicalKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedCanonicalKmers{DNACodon}(s, 3)`, -you can use `SpacedCanonicalKmers(s, Val(3), 3)` -""" -function SpacedCanonicalKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return SpacedCanonicalKmers{Kmer{A,K}}(seq, step, start, stop) -end - -Base.step(x::SpacedCanonicalKmers) = x.step - -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - fwkmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) - if isnothing(fwkmer) - return nothing - else - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) # Based on alphabet type, should constant fold. - rvkmer = rightshift_carry(_reverse(BioSequences.BitsPerSymbol(A()), _complement_bitpar(A(), fwkmer...)...), rshift) - return (1, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (K, fwkmer, rvkmer) - end -end - -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} - i, fwkmer, rvkmer = state - filled = it.filled - i += it.increment - - for _ in filled:K-1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - rshift = (64 - (n_unused(Kmer{A,K,N}) + 1) * bps) # Based on type info, should constant fold. - mask = (one(UInt64) << bps) - one(UInt64) # Based on type info, should constant fold. - fbits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - rbits = (BioSequences.complement_bitpar(fbits, A()) & mask) << rshift - fwkmer = leftshift_carry(fwkmer, bps, fbits) - rvkmer = rightshift_carry(rvkmer, bps, rbits) - i += 1 - end - end - pos = i - K + 1 - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), (i, fwkmer, rvkmer) -end - -@inline function Base.iterate(it::SpacedCanonicalKmers{Kmer{A,K,N},LongSequence{B}}, state = (it.start - it.increment, 1, 0, blank_ntuple(Kmer{A,K,N}), blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} - i, pos, filled, fwkmer, rvkmer = state - i += it.increment - - while i ≤ it.stop - nt = reinterpret(UInt8, @inbounds getindex(it.seq, i)) - @inbounds fbits = UInt64(kmerbits[nt + 1]) - rbits = ~fbits & typeof(fbits)(0x03) - if fbits == 0xff # ambiguous - filled = 0 - # Find the beginning of next possible kmer after i - pos = i + it.step - Core.Intrinsics.urem_int(i - pos, it.step) - i = pos - 1 - else - filled += 1 - fwkmer = leftshift_carry(fwkmer, 2, fbits) - rvkmer = rightshift_carry(rvkmer, 2, UInt64(rbits) << (62 - (64N - 2K))) - end - if filled == K - state = (i, i - K + 1 + it.step, it.filled, fwkmer, rvkmer) - return (pos, min(Kmer{A,K,N}(fwkmer), Kmer{A,K,N}(rvkmer))), state - end - i += 1 - end - return nothing -end \ No newline at end of file diff --git a/src/kmer_iteration/SpacedKmers.jl b/src/kmer_iteration/SpacedKmers.jl deleted file mode 100644 index 18461ee..0000000 --- a/src/kmer_iteration/SpacedKmers.jl +++ /dev/null @@ -1,127 +0,0 @@ - -""" - SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - -An iterator over every valid `T<:Kmer` separated by a `step` parameter, in a given -longer `BioSequence`, between a `start` and `stop` position. - -!!! note - Typically, the alphabet of the Kmer type matches the alphabet of the input - BioSequence. In these cases, the iterator will have `Base.IteratorSize` of - `Base.HasLength`, and successive kmers produced by the iterator will overlap - by `max(0, K - step)` bases. - - However, in the specific case of iterating over kmers in a DNA or RNA sequence, you - may iterate over a Kmers where the alphabet is a NucleicAcidAlphabet{2}, but - the input BioSequence has a NucleicAcidAlphabet{4}. - - In this case then the iterator will skip over positions in the BioSequence - with characters that are not supported by the Kmer type's NucleicAcidAlphabet{2}. - - As a result, the overlap between successive kmers may not consistent, but the - reading frame will be preserved. - In addition, the iterator will have `Base.IteratorSize` of `Base.SizeUnknown`. -""" -struct SpacedKmers{T<:Kmer,S<:BioSequence} <: AbstractKmerIterator{T,S} - seq::S - start::Int - step::Int - stop::Int - filled::Int # This is cached for speed - increment::Int # This is cached for speed - - function SpacedKmers{T,S}(seq::S, step::Int, start::Int, stop::Int) where {T<:Kmer,S<:BioSequence} - T′ = kmertype(T) - checkmer(T′) # Should inline and constant fold. - if step <= 1 - throw(ArgumentError("step size must be greater than 1")) - end - filled = max(0, ksize(T′) - step) - increment = max(1, step - ksize(T′) + 1) - return new{T′,S}(seq, start, step, stop, filled, increment) - end -end - -""" - SpacedKmers{T}(seq::S, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - -Convenience outer constructor so you don't have to specify `S` along with `T`. - -E.g. Instead of `SpacedKmers{DNACodon,typeof(s)}(s, 3)`, you can just use `SpacedKmers{DNACodon}(s, 3)` -""" -function SpacedKmers{T}(seq::S, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {T<:Kmer,S<:BioSequence} - return SpacedKmers{T,S}(seq, step, start, stop) -end - -""" - SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - -Convenience outer constructor so yyou don't have to specify full `Kmer` typing. - -In order to deduce `Kmer{A,K,N}`, `A` is taken from the input `seq` type, `K` is -taken from `::Val{K}`, and `N` is deduced using `A` and `K`. - -E.g. Instead of `SpacedKmers{DNAKmer{3,1}}(s, 3)`, or `SpacedKmers{DNACodon}(s, 3)`, -you can use `SpacedKmers(s, Val(3), 3)` -""" -function SpacedKmers(seq::BioSequence{A}, ::Val{K}, step::Int, start = firstindex(seq), stop = lastindex(seq)) where {A,K} - return SpacedKmers{Kmer{A,K}}(seq, step, start, stop) -end - -Base.step(x::SpacedKmers) = x.step - -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{A}}) where {A,K,N} - kmer = _build_kmer_data(Kmer{A,K,N}, it.seq, 1) - if isnothing(kmer) - return nothing - else - # Get the reverse. - alph = Alphabet(Kmer{A,K,N}) - return (1, Kmer{A,K,N}(kmer)), (K, kmer) - end -end - -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{A}}, state) where {A,K,N} - i, kmer = state - filled = it.filled - i += it.increment - - for _ in filled:K-1 - if i > it.stop - return nothing - else - bps = BioSequences.bits_per_symbol(A()) # Based on type info, should constant fold. - bits = UInt64(BioSequences.extract_encoded_element(it.seq, i)) - kmer = leftshift_carry(kmer, bps, bits) - i += 1 - end - end - pos = i - K + 1 - return (pos, Kmer{A,K,N}(kmer)), (i, kmer) -end - -@inline function Base.iterate(it::SpacedKmers{Kmer{A,K,N},LongSequence{B}}, state = (it.start - it.increment, 1, 0, blank_ntuple(Kmer{A,K,N})) - ) where {A<:NucleicAcidAlphabet{2},B<:NucleicAcidAlphabet{4},K,N} - i, pos, filled, kmer = state - i += it.increment - - while i ≤ it.stop - nt = reinterpret(UInt8, @inbounds getindex(it.seq, i)) - @inbounds bits = UInt64(kmerbits[nt + 1]) - if bits == 0xff # ambiguous - filled = 0 - # Find the beginning of next possible kmer after i - pos = i + it.step - Core.Intrinsics.urem_int(i - pos, it.step) - i = pos - 1 - else - filled += 1 - kmer = leftshift_carry(kmer, 2, bits) - end - if filled == K - state = (i, i - K + 1 + it.step, it.filled, kmer) - return (pos, Kmer{A,K,N}(kmer)), state - end - i += 1 - end - return nothing -end diff --git a/src/predicates.jl b/src/predicates.jl deleted file mode 100644 index 86258e6..0000000 --- a/src/predicates.jl +++ /dev/null @@ -1,11 +0,0 @@ -### -### Mer specific specializations of src/biosequence/predicates.jl -### - -Base.cmp(x::T, y::T) where {T<:Kmer} = cmp(x.data, y.data) -Base.:(==)(x::T, y::T) where {T<:Kmer} = x.data == y.data -Base.isless(x::T, y::T) where {T<:Kmer} = isless(x.data, y.data) - -# TODO: Ensure this is the right way to go. -# See https://github.com/BioJulia/BioSequences.jl/pull/121#discussion_r475234270 -Base.hash(x::Kmer{A,K,N}, h::UInt) where {A,K,N} = hash(x.data, h ⊻ K) \ No newline at end of file diff --git a/src/revtrans.jl b/src/revtrans.jl index e4ebedf..ed4c381 100644 --- a/src/revtrans.jl +++ b/src/revtrans.jl @@ -18,14 +18,12 @@ julia> Set(CodonSet(v)) == Set(v) true julia> union(CodonSet(v), CodonSet([mer"GAG"r])) -Kmers.CodonSet with 4 elements: +CodonSet with 4 elements: GAG GGA UAG UUU ``` - -See also: `push` """ struct CodonSet <: AbstractSet{RNACodon} x::UInt64 @@ -33,11 +31,11 @@ struct CodonSet <: AbstractSet{RNACodon} CodonSet(x::UInt64, ::Unsafe) = new(x) end CodonSet() = CodonSet(UInt64(0), Unsafe()) -CodonSet(itr) = foldl(push, itr, init=CodonSet()) +CodonSet(itr) = foldl(push, itr; init=CodonSet()) function Base.iterate(x::CodonSet, s::UInt64=x.x) - codon = RNACodon((trailing_zeros(s) % UInt64,)) - iszero(s) ? nothing : (codon, s & (s-1)) + codon = RNACodon(unsafe, (trailing_zeros(s) % UInt64,)) + iszero(s) ? nothing : (codon, s & (s - 1)) end function push(s::CodonSet, x::RNACodon) @@ -52,8 +50,8 @@ Base.filter(f, s::CodonSet) = CodonSet(Iterators.filter(f, s)) Base.setdiff(a::CodonSet, b::Vararg{CodonSet}) = CodonSet(a.x & ~(union(b...).x), Unsafe()) for (name, f) in [(:union, |), (:intersect, &), (:symdiff, ⊻)] - @eval function Base.$(name)(a::CodonSet, b::Vararg{CodonSet}) - CodonSet(mapreduce(i -> i.x, $f, b, init=a.x), Unsafe()) + @eval function Base.$(name)(a::CodonSet, b::Vararg{CodonSet}) + CodonSet(mapreduce(i -> i.x, $f, b; init=a.x), Unsafe()) end end @@ -76,7 +74,7 @@ inverse of the mapping through `GeneticCode` julia> code = ReverseGeneticCode(BioSequences.candidate_division_sr1_genetic_code); julia> code[AA_E] -Kmers.CodonSet with 2 elements: +CodonSet with 2 elements: GAA GAG @@ -89,17 +87,17 @@ See also: [`reverse_translate`](@ref) """ struct ReverseGeneticCode <: AbstractDict{AminoAcid, CodonSet} name::String - sets::NTuple{N_AA-1, CodonSet} + sets::NTuple{N_AA - 1, CodonSet} end function ReverseGeneticCode(x::BioSequences.GeneticCode) ind(aa::AminoAcid) = reinterpret(UInt8, aa) + 1 - sets = fill(CodonSet(), N_AA-1) + sets = fill(CodonSet(), N_AA - 1) x_set = CodonSet() for i in Int64(0):Int64(63) aa = x.tbl[i + 1] - codon = RNACodon((i % UInt64,)) + codon = RNACodon(unsafe, (i % UInt64,)) sets[ind(aa)] = push(sets[ind(aa)], codon) if aa !== AA_Term x_set = push(x_set, codon) @@ -122,9 +120,7 @@ function ReverseGeneticCode(x::BioSequences.GeneticCode) ReverseGeneticCode(x.name, Tuple(sets)) end -const rev_standard_genetic_code = ReverseGeneticCode( - BioSequences.standard_genetic_code -) +const rev_standard_genetic_code = ReverseGeneticCode(BioSequences.standard_genetic_code) function Base.getindex(s::ReverseGeneticCode, a::AminoAcid) if reinterpret(UInt8, a) > (N_AA - 2) # cannot translate gap @@ -136,21 +132,29 @@ end Base.length(c::ReverseGeneticCode) = length(c.sets) function Base.iterate(c::ReverseGeneticCode, s=1) s > length(c.sets) && return nothing - return (reinterpret(AminoAcid, (s-1)%UInt8) => c.sets[s], s+1) + return (reinterpret(AminoAcid, (s - 1) % UInt8) => c.sets[s], s + 1) end """ - reverse_translate!(v::Vector{CodonSet}, s::AASeq code=rev_standard_genetic_code) + reverse_translate!(v::Vector{CodonSet}, s::AASeq code=rev_standard_genetic_code) -> v Reverse-translates `s` under the reverse genetic code `code`, putting the result in `v`. See also: [`reverse_translate`](@ref) + +# Examples: +```jldoctest +julia> v = CodonSet[]; + +julia> reverse_translate!(v, aa"KWCL") +4-element Vector{CodonSet}: + CodonSet(0x0000000000000005) + CodonSet(0x0400000000000000) + CodonSet(0x0a00000000000000) + CodonSet(0x50000000f0000000) +``` """ -function reverse_translate!( - v::Vector{CodonSet}, - seq::AASeq, - code=rev_standard_genetic_code -) +function reverse_translate!(v::Vector{CodonSet}, seq::AASeq, code=rev_standard_genetic_code) resize!(v, length(seq)) @inbounds for i in eachindex(v) v[i] = code[seq[i]] @@ -168,21 +172,20 @@ If `s` is an `AASeq`, return `Vector{CodonSet}`. # Examples ```jldoctest julia> reverse_translate(AA_W) -Kmers.CodonSet with 1 element: +CodonSet with 1 element: UGG julia> v = reverse_translate(aa"MMLVQ"); julia> typeof(v) -Vector{Kmers.CodonSet} +Vector{CodonSet} (alias for Array{CodonSet, 1}) julia> v[4] -Kmers.CodonSet with 4 elements: +CodonSet with 4 elements: GUA GUC GUG GUU -[...] ``` See also: [`reverse_translate!`](@ref), [`ReverseGeneticCode`](@ref) diff --git a/src/transformations.jl b/src/transformations.jl index 6aa3456..7681422 100644 --- a/src/transformations.jl +++ b/src/transformations.jl @@ -1,221 +1,104 @@ - -# Bit-parallel element nucleotide complementation -@inline function _complement_bitpar(a::A, head::UInt64, tail...) where {A<:NucleicAcidAlphabet} - return (BioSequences.complement_bitpar(head, A()), _complement_bitpar(a, tail...)...) -end - -@inline _complement_bitpar(a::A) where {A<:NucleicAcidAlphabet} = () - -@inline function pushfirst(x::Kmer{A,K,N}, nt) where {A,K,N} - ntbits = UInt64(BioSequences.encode(A(), nt)) << (62 - (64N - 2K)) - #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) << (62 - (64N - 2K)) - return Kmer{A,K,N}(_rightshift_carry(2, ntbits, x.data...)) -end - -@inline function pushlast(x::Kmer{A,K,N}, nt) where {A,K,N} - ntbits = UInt64(BioSequences.encode(A(), nt)) - #ntbits = UInt64(@inbounds BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) - _, newbits = _leftshift_carry(2, ntbits, x.data...) - return Kmer{A,K,N}(newbits) -end - - -### -### Transformation methods -### - -""" - complement(seq::T) where {T<:Kmer} - -Return a kmer's complement kmer. - -# Examples - -```jldoctest -julia> complement(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -AATCG -``` -""" -@inline function BioSequences.complement(seq::T) where {T<:Kmer} - return T(_complement_bitpar(Alphabet(seq), seq.data...)) -end - -""" - reverse(seq::Kmer{A,K,N}) where {A,K,N} - -Return a kmer that is the reverse of the input kmer. - -# Examples - -```jldoctest -julia> reverse(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -CGATT -``` -""" -@inline function Base.reverse(seq::Kmer{A,K,N}) where {A,K,N} - rdata = _reverse(BioSequences.BitsPerSymbol(seq), seq.data...) - # rshift should constant-fold. - rshift = n_unused(Kmer{A,K,N}) * BioSequences.bits_per_symbol(A()) - return Kmer{A,K,N}(rightshift_carry(rdata, rshift)) # based on only 2 bit alphabet. -end - -""" - reverse_complement(seq::Kmer) - -Return the kmer that is the reverse complement of the input kmer. - -# Examples - -```jldoctest -julia> reverse_complement(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -GCTAA -``` -""" -@inline function BioSequences.reverse_complement(seq::Kmer{A,K,N}) where {A,K,N} - return complement(reverse(seq)) +function Base.reverse(x::Kmer) + # ( ABC, DEFG) # reverse each element + # (CBA , GFED) # reverse elements + # (GFED, CBA ) # rightshift carry a zero + # ( GFE, DBCA) # final result + Bps = BioSequences.BitsPerSymbol(Alphabet(x)) + data = map(i -> BioSequences.reversebits(i, Bps), reverse(x.data)) + (_, data) = rightshift_carry(data, bits_unused(typeof(x)), zero(UInt)) + typeof(x)(unsafe, data) end -#= -@inline function reverse_complement2(seq::Kmer{A,K,N}) where {A,K,N} - f = x -> complement_bitpar(x, A()) - rdata = _reverse(f, BioSequences.BitsPerSymbol(seq), seq.data...) - return Kmer{A,K,N}(rightshift_carry(rdata, 64N - 2K)) +# For this method, we don't need to mask the unused bits, because the complement of +# 0x0 == DNA_Gap is still DNA_Gap +function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}) + isempty(x) && return x + data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) + typeof(x)(unsafe, data) end -=# - -""" - BioSequences.canonical(seq::Kmer{A,K,N}) where {A,K,N} -Return the canonical sequence of `seq`. - -A canonical sequence is the numerical lesser of a kmer and its reverse complement. -This is useful in hashing/counting sequences in data that is not strand specific, -and thus observing the short sequence is equivalent to observing its reverse complement. - -# Examples - -```jldoctest -julia> canonical(Kmer(DNA_T, DNA_T, DNA_A, DNA_G, DNA_C)) -DNA 5-mer: -GCTAA -``` -""" -@inline function BioSequences.canonical(seq::Kmer{A,K,N}) where {A,K,N} - if N < 4 - return min(seq, reverse_complement(seq)) - else - return iscanonical(seq) ? seq : reverse_complement(seq) - end +# For this method we do need to mask unused bits, unlike above +function BioSequences.complement(x::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}) + isempty(x) && return x + data = map(i -> BioSequences.complement_bitpar(i, Alphabet(x)), x.data) + typeof(x)(unsafe, ((first(data) & get_mask(typeof(x))), Base.tail(data)...)) end -### -### Old Mer specific specializations of src/biosequence/transformations.jl -### - not currently transferred to new type. - -# TODO: Sort this and decide on transferring to new NTuple based kmers or no. - -#= -function swap(x::T, i, j) where {T<:AbstractMer} - i = 2 * length(x) - 2i - j = 2 * length(x) - 2j - b = encoded_data(x) - x = ((b >> i) ⊻ (b >> j)) & encoded_data_type(x)(0x03) - return T(b ⊻ ((x << i) | (x << j))) +# Generic fallback +function BioSequences.complement(x::Kmer{<:NucleicAcidAlphabet}) + typeof(x)((complement(i) for i in x)) end - - -function Random.shuffle(x::T) where {T<:AbstractMer} - # Fisher-Yates shuffle for mers. - j = lastindex(x) - for i in firstindex(x):(j - 1) - j′ = rand(i:j) - x = swap(x, i, j′) - end - return x +# TODO: Should this be the generic BioSequence def in BioSequences.jl? +function BioSequences.reverse_complement(x::Kmer) + @inline(reverse(@inline(complement(x)))) end -=# -throw_translate_err(K) = error("Cannot translate Kmer of size $K not divisible by 3") - -@inline function setup_translate(seq::Kmer{<:NucleicAcidAlphabet, K}) where K - naa, rem = divrem(K, 3) - iszero(rem) || throw_translate_err(K) - kmertype(AAKmer{naa}) +function BioSequences.canonical(x::Kmer) + rc = reverse_complement(x) + ifelse(x < rc, x, rc) end -# This sets the first amino acid to methionine, returning the data tuple -@inline function set_methionine_data(data::Tuple{Vararg{UInt64}}, ::Val{K}) where K - offset = ((K - 1) * 8) & 63 - mask = ~(UInt64(0xff) << offset) # mask off existing AA in pos 1 - addition = UInt64(0x0c) << offset # 0x0c is encoded methionine - chunk, rest... = data - chunk = (chunk & mask) | addition - return (chunk, rest...) -end +BioSequences.iscanonical(x::Kmer) = x <= reverse_complement(x) function BioSequences.translate( - seq::Union{RNAKmer, DNAKmer}; - code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, # a noop for this method - alternative_start::Bool = false -) - T = setup_translate(seq) - data = blank_ntuple(T) - for i in 1:ksize(T) - a = seq[3*i - 2] - b = seq[3*i - 1] - c = seq[3*i - 0] + seq::Kmer{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}; + code::BioSequences.GeneticCode=BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool=true, # noop in this method + alternative_start::Bool=false, +) + iszero(ksize(typeof(seq))) && return mer""a + n_aa, remainder = divrem(length(seq), 3) + iszero(remainder) || + error("LongRNA length is not divisible by three. Cannot translate.") + N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) + T = Kmer{AminoAcidAlphabet, n_aa, N} + data = zero_tuple(T) + # In the next two lines: If alternative_start, we shift in the encoding of M + # to first place, then we skip the first 3 nucleotides + (_, data) = leftshift_carry(data, 8, UInt(0x0c) * alternative_start) + @inbounds for i in (1 + (3 * alternative_start)):n_aa + a = seq[3i - 2] + b = seq[3i - 1] + c = seq[3i - 0] codon = BioSequences.unambiguous_codon(a, b, c) aa = code[codon] - # Next line is equivalent to encode, but without checking. - # We assume genetic codes do not code to invalid data. - enc_data = reinterpret(UInt8, aa) % UInt64 - data = leftshift_carry(data, 8, enc_data) + carry = UInt(reinterpret(UInt8, aa)) + (_, data) = + leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end - # This is probably not needed for kmers, but kept for compatibility. - # It does slightly slow down translation, even when not taken. - if alternative_start && !iszero(ksize(T)) - data = set_methionine_data(data, Val(ksize(T))) - end - return T(data) + T(unsafe, data) end -# See the function above for comments, or the equivalent function -# in BioSequences function BioSequences.translate( - seq::Kmer{<:NucleicAcidAlphabet}; - code=BioSequences.standard_genetic_code, - allow_ambiguous_codons::Bool = true, - alternative_start::Bool = false -) - T = setup_translate(seq) - data = blank_ntuple(T) - for i in 1:ksize(T) - a = reinterpret(RNA, seq[3*i - 2]) - b = reinterpret(RNA, seq[3*i - 1]) - c = reinterpret(RNA, seq[3*i - 0]) - aa = if BioSequences.isambiguous(a) | BioSequences.isambiguous(b) | BioSequences.isambiguous(c) - aa_ = BioSequences.try_translate_ambiguous_codon(code, a, b, c) - if aa_ === nothing - if allow_ambiguous_codons - aa_ = AA_X - else - error("codon ", a, b, c, " cannot be unambiguously translated") - end - end - aa_ - else + seq::Kmer{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; + code::BioSequences.GeneticCode=BioSequences.standard_genetic_code, + allow_ambiguous_codons::Bool=true, + alternative_start::Bool=false, +) + n_aa, remainder = divrem(length(seq), 3) + iszero(remainder) || + error("LongRNA length is not divisible by three. Cannot translate.") + N = n_coding_elements(Kmer{AminoAcidAlphabet, n_aa}) + T = Kmer{AminoAcidAlphabet, n_aa, N} + data = zero_tuple(T) + # In the next two lines: If alternative_start, we shift in the encoding of M + # to first place, then we skip the first 3 nucleotides + (_, data) = leftshift_carry(data, 8, UInt(0x0c) * alternative_start) + @inbounds for i in (1 + (3 * alternative_start)):n_aa + a = reinterpret(RNA, seq[3i - 2]) + b = reinterpret(RNA, seq[3i - 1]) + c = reinterpret(RNA, seq[3i - 0]) + aa = if isgap(a) | isgap(b) | isgap(c) + error("Cannot translate nucleotide sequences with gaps.") + elseif iscertain(a) & iscertain(b) & iscertain(c) code[BioSequences.unambiguous_codon(a, b, c)] + else + BioSequences.try_translate_ambiguous_codon(code, a, b, c, allow_ambiguous_codons) end - enc_data = reinterpret(UInt8, aa) % UInt64 - data = leftshift_carry(data, 8, enc_data) - end - if alternative_start && !iszero(ksize(T)) - data = set_methionine_data(data, Val(ksize(T))) + carry = UInt(reinterpret(UInt8, aa)) + (_, data) = + leftshift_carry(data, BioSequences.bits_per_symbol(AminoAcidAlphabet()), carry) end - return T(data) + T(unsafe, data) end diff --git a/src/tuple_bitflipping.jl b/src/tuple_bitflipping.jl index 4476956..974bf55 100644 --- a/src/tuple_bitflipping.jl +++ b/src/tuple_bitflipping.jl @@ -1,72 +1,50 @@ - -# TODO: this should end up in BioSequences.jl? - -"Extract the element stored in a packed bitarray referred to by bidx." -@inline function BioSequences.extract_encoded_element(bidx::BioSequences.BitIndex{N,W}, data::NTuple{n,W}) where {N,n,W} - @inbounds chunk = data[BioSequences.index(bidx)] - offchunk = chunk >> (BioSequences.bitwidth(W) - N - BioSequences.offset(bidx)) - return offchunk & BioSequences.bitmask(bidx) -end - - - -""" - _cliphead(by::Integer, head::UInt64, tail...) - -A method used to mask the first `by` MSB's in `head`, before catting it with -tail to return a NTuple. - -This is used internally to mask the first `by` bits in the first word of a -NTuple of UInt64's. - -Notably it's used when constructing a Kmer from an existing NTuple of UInt64 -""" -@inline function _cliphead(by::Integer, head::UInt64, tail...) - return (head & (typemax(UInt64) >> by), tail...) -end - -#= -rightshift_carry & leftshift_carry - -These methods are micro-optimised (or should be!!!) for shifting the bits in -an NTuple of unsigned integers, carrying the bits "shifted off" one word -over to the next word. The carry can also be "seeded" so as other methods like -pushfirst and pushlast can be efficiently implemented without duplication of code -or less efficient implementations that first shift and then insert an element. -=# - -@inline function rightshift_carry(x::NTuple{N,UInt64}, nbits::Integer, prevcarry = zero(UInt64)) where {N} - return _rightshift_carry(nbits, prevcarry, x...) +# These compile to raw CPU instructions and are therefore more +# efficient than simply using << and >>> +@inline function left_shift(x::Unsigned, n::Integer) + x << (n & ((sizeof(x) * 8) - 1)) end -@inline function _rightshift_carry(nbits::Integer, carry::UInt64, head::UInt64, tail...) - return ((head >> nbits) | carry, _rightshift_carry(nbits, (head & ((one(UInt64) << nbits) - 1)) << (64 - nbits), tail...)...) +@inline function right_shift(x::Unsigned, n::Integer) + x >>> (n & ((sizeof(x) * 8) - 1)) end -@inline _rightshift_carry(nbits::Integer, carry::UInt64) = () - -@inline function leftshift_carry(x::NTuple{N,UInt64}, nbits::Integer, prevcarry::UInt64 = zero(UInt64)) where {N} - _, newbits = _leftshift_carry(nbits, prevcarry, x...) - return newbits +# When the UInt is shifted n bits, these are the bits +# that are shifted away (carried over) +@inline function left_carry(x::Unsigned, n::Integer) + right_shift(x, 8 * sizeof(x) - n) end -@inline function _leftshift_carry(nbits::Integer, prevcarry::UInt64, head::UInt64, tail...) - carry, newtail = _leftshift_carry(nbits, prevcarry, tail...) - return head >> (64 - nbits), ((head << nbits) | carry, newtail...) +@inline function right_carry(x::Unsigned, n::Integer) + left_shift(x, 8 * sizeof(x) - n) end -@inline _leftshift_carry(nbits::Integer, prevcarry::UInt64) = prevcarry, () - -@inline function _reverse(bpe::BioSequences.BitsPerSymbol{N}, head::UInt64, tail...) where {N} - return (_reverse(bpe, tail...)..., BioSequences.reversebits(head, bpe)) +# Shift a tuple left nbits, carry over bits between tuple elements, and OR +# the `carry` argument to the right side of the resulting tuple. +# Returns (new_carry, new_tuple) +@inline function leftshift_carry( + x::Tuple{Vararg{T}}, + nbits::Integer, + carry::T, +) where {T <: Unsigned} + isempty(x) && return x + (new_carry, new_tail) = leftshift_carry(tail(x), nbits, carry) + new_head = left_shift(first(x), nbits) | new_carry + (left_carry(first(x), nbits), (new_head, new_tail...)) end -@inline _reverse(::BioSequences.BitsPerSymbol{N}) where {N} = () - -#= -@inline function _reverse(f::F, bpe::BioSequences.BitsPerSymbol{N}, head::UInt64, tail...) where {N,F<:Function} - return (_reverse(f, bpe, tail...)..., f(reversebits(head, bpe))) +@inline function rightshift_carry( + x::Tuple{Vararg{T}}, + nbits::Integer, + carry::T, +) where {T <: Unsigned} + isempty(x) && return x + new_head = right_shift(first(x), nbits) | right_carry(carry, nbits) + mask = left_shift(UInt(1), nbits) - 1 + tail_carry = first(x) & mask + (new_carry, new_tail) = rightshift_carry(tail(x), nbits, tail_carry) + (new_carry, (new_head, new_tail...)) end -@inline _reverse(f::F, ::BioSequences.BitsPerSymbol{N}) where {N,F<:Function} = () -=# \ No newline at end of file +# Recusion terminator for above +@inline leftshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) +@inline rightshift_carry(::Tuple{}, nbits::Integer, carry::Unsigned) = (carry, ()) diff --git a/test/access.jl b/test/access.jl deleted file mode 100644 index aa4a35e..0000000 --- a/test/access.jl +++ /dev/null @@ -1,96 +0,0 @@ -@testset "Access and Iterations" begin - dna_kmer = mer"ACTG"dna - rna_kmer = mer"ACUG"rna - aa_kmer = mer"MVXN"aa - - @testset "Access DNA Kmer" begin - @test dna_kmer[1] == DNA_A - @test dna_kmer[2] == DNA_C - @test dna_kmer[3] == DNA_T - @test dna_kmer[4] == DNA_G - - @test dna_kmer[1:3] == mer"ACT"dna - @test dna_kmer[2:4] == mer"CTG"dna - - # Access indexes out of bounds - @test_throws BoundsError dna_kmer[-1] - @test_throws BoundsError dna_kmer[0] - @test_throws BoundsError dna_kmer[5] - @test_throws BoundsError getindex(dna_kmer,-1) - @test_throws BoundsError getindex(dna_kmer, 0) - @test_throws BoundsError getindex(dna_kmer, 5) - @test_throws BoundsError dna_kmer[3:7] - end - - @testset "Iteration through DNA Kmer" begin - @test iterate(DNAKmer("ACTG")) == (DNA_A, 2) - - @test iterate(DNAKmer("ACTG"), 1) == (DNA_A, 2) - @test iterate(DNAKmer("ACTG"), 4) == (DNA_G, 5) - - @test iterate(DNAKmer("ACTG"), 1) !== nothing - @test iterate(DNAKmer("ACTG"), 4) !== nothing - @test iterate(DNAKmer("ACTG"), 5) === nothing - @test isnothing(iterate(DNAKmer("ACTG"), -1)) - @test iterate(DNAKmer("ACTG"), 0) === nothing - - dna_vec = [DNA_A, DNA_C, DNA_T, DNA_G] - @test all([nt === dna_vec[i] for (i, nt) in enumerate(dna_kmer)]) - end - - @testset "Access RNA Kmer" begin - @test rna_kmer[1] == RNA_A - @test rna_kmer[2] == RNA_C - @test rna_kmer[3] == RNA_U - @test rna_kmer[4] == RNA_G - - @test rna_kmer[1:3] == mer"ACU"rna - @test rna_kmer[2:4] == mer"CUG"rna - - # Access indexes out of bounds - @test_throws BoundsError rna_kmer[-1] - @test_throws BoundsError rna_kmer[0] - @test_throws BoundsError rna_kmer[5] - @test_throws BoundsError getindex(rna_kmer, -1) - @test_throws BoundsError getindex(rna_kmer, 0) - @test_throws BoundsError getindex(rna_kmer, 5) - @test_throws BoundsError rna_kmer[3:7] - end - - @testset "Iteration through RNA Kmer" begin - @test iterate(RNAKmer("ACUG")) == (RNA_A, 2) - - - @test iterate(RNAKmer("ACUG"), 1) == (RNA_A, 2) - @test iterate(RNAKmer("ACUG"), 4) == (RNA_G, 5) - - - @test iterate(RNAKmer("ACUG"), 1) !== nothing - @test iterate(RNAKmer("ACUG"), 4) !== nothing - @test iterate(RNAKmer("ACUG"), 5) === nothing - @test iterate(RNAKmer("ACUG"), -1) === nothing - @test iterate(RNAKmer("ACUG"), 0) === nothing - - rna_vec = [RNA_A, RNA_C, RNA_U, RNA_G] - @test all([nt === rna_vec[i] for (i, nt) in enumerate(rna_kmer)]) - end - - @testset "Access AA Kmer" begin - @test aa_kmer[1] == AA_M - @test aa_kmer[2] == AA_V - @test aa_kmer[3] == AA_X - @test aa_kmer[4] == AA_N - - @test aa_kmer[1:3] == mer"MVX"aa - @test aa_kmer[2:4] == mer"VXN"aa - - # Access indexes out of bounds - @test_throws BoundsError aa_kmer[-1] - @test_throws BoundsError aa_kmer[0] - @test_throws BoundsError aa_kmer[5] - @test_throws BoundsError getindex(aa_kmer,-1) - @test_throws BoundsError getindex(aa_kmer, 0) - @test_throws BoundsError getindex(aa_kmer, 5) - @test_throws BoundsError aa_kmer[3:7] - end -end diff --git a/test/biosequences_interface.jl b/test/biosequences_interface.jl deleted file mode 100644 index 5db38a3..0000000 --- a/test/biosequences_interface.jl +++ /dev/null @@ -1,12 +0,0 @@ -@testset "BioSequences Interface" begin - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{2},31}), rand(ACGT, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{4},31}), rand(ACGT, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{2},31}), rand(ACGU, 31), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{4},31}), rand(ACGU, 31), false) - - - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{2},200}), rand(ACGT, 200), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{DNAAlphabet{4},200}), rand(ACGT, 200), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{2},200}), rand(ACGU, 200), false) - @test BioSequences.has_interface(BioSequence, Kmers.kmertype(Kmer{RNAAlphabet{4},200}), rand(ACGU, 200), false) -end \ No newline at end of file diff --git a/test/comparisons.jl b/test/comparisons.jl deleted file mode 100644 index 78bf48a..0000000 --- a/test/comparisons.jl +++ /dev/null @@ -1,64 +0,0 @@ -@testset "Comparisons" begin - @testset "Equality" begin - function check_seq_kmer_equality(len) - a = DNAKmer(random_dna_kmer(len)) - b = LongDNA{4}(a) - c = LongDNA{2}(a) - return a == b == c && c == b == a - end - - for len in [1, 10, 32, 64, 128] - @test all(Bool[check_seq_kmer_equality(len) for _ in 1:reps]) - end - - # True negatives - @test DNAKmer("ACG") != RNAKmer("ACG") - @test DNAKmer("T") != RNAKmer("U") - @test DNAKmer("AC") != DNAKmer("AG") - @test RNAKmer("AC") != RNAKmer("AG") - @test AAKmer("MV") != AAKmer("NM") - - @test DNAKmer("ACG") != rna"ACG" - @test DNAKmer("T") != rna"U" - @test DNAKmer("AC") != dna"AG" - @test RNAKmer("AC") != rna"AG" - @test AAKmer("MV") != aa"NM" - - @test rna"ACG" != DNAKmer("ACG") - @test rna"U" != DNAKmer("T") - @test dna"AG" != DNAKmer("AC") - @test rna"AG" != RNAKmer("AC") - @test aa"MV" != AAKmer("NM") - end - - @testset "Inequality" begin - for len in [1, 10, 32, 64] - if len <= 32 - @test isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(1),))) - @test !isless(DNAKmer{1}((UInt64(0),)), DNAKmer{1}((UInt64(0),))) - @test !isless(DNAKmer{1}((UInt64(1),)), DNAKmer{1}((UInt64(0),))) - - @test isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(1),))) - @test !isless(RNAKmer{1}((UInt64(0),)), RNAKmer{1}((UInt64(0),))) - @test !isless(RNAKmer{1}((UInt64(1),)), RNAKmer{1}((UInt64(0),))) - end - end - end - - @testset "Hash" begin - kmers = map(DNAKmer, ["AAAA", "AACT", "ACGT", "TGCA"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - - kmers = map(RNAKmer, ["AAAA", "AACU", "ACGU", "UGCA"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - - kmers = map(AAKmer, ["AMVK", "FPST", "QEGH", "ARND"]) - for x in kmers, y in kmers - @test (x == y) == (hash(x) == hash(y)) - end - end -end diff --git a/test/construction_and_conversion.jl b/test/construction_and_conversion.jl deleted file mode 100644 index 567042c..0000000 --- a/test/construction_and_conversion.jl +++ /dev/null @@ -1,170 +0,0 @@ -global reps = 10 - -@testset "Construction and Conversions" begin - @test Kmer(DNA_A, DNA_G, DNA_T) === Kmer("AGT") - @test Kmer(RNA_A, RNA_G, RNA_U) === Kmer("AGU") - #@test Kmer(AA_R, AA_D, AA_C, AA_B) === Kmer("RDCB") - - @test DNAKmer(DNA_G, DNA_C, DNA_T) == Kmer("GCT") - @test RNAKmer(RNA_G, RNA_U, RNA_C, RNA_U) == Kmer("GUCU") - - # creation from iterator - @test Kmers.kmertype(Kmer{DNAAlphabet{2},31})((i for i in rand(ACGT, 31))) isa Kmers.kmertype(Kmer{DNAAlphabet{2},31}) - - # Check that kmers in strings survive round trip conversion: - # String → Kmer → String - function check_string_construction(::Type{T}, seq::AbstractString) where {T<:Kmer} - return String(T(seq)) == uppercase(seq) - end - - # Check that RNAKmers can be constructed from a LongRNASeq - # LongSequence{A} → Kmer{A,K,N} → LongSequence{A} - function check_longsequence_construction(::Type{T}, seq::S) where {T<:Kmer,S<:LongSequence} - return S(T(seq)) == seq - end - - # Check that kmers can be constructed from a BioSequence - # BioSequence → Kmer → BioSequence - function check_biosequence_construction(::Type{T}, seq::LongSequence) where {T<:Kmer} - return LongSequence(T(seq)) == seq - end - - # Check that kmers can be constructed from an array of nucleotides - # Vector{T} → Kmer → Vector{T} - function check_nucarray_kmer(::Type{M}, seq::Vector{T}) where {T,M<:Kmer} - return String([convert(Char, c) for c in seq]) == String(M(seq)) - end - - # Check that kmers in strings survive round trip conversion: - # String → BioSequence → Kmer → BioSequence → String - function check_roundabout_construction(::Type{T}, A2, seq::AbstractString) where {T<:Kmer} - return String(LongSequence{A2}(T(LongSequence{A2}(seq)))) == uppercase(seq) - end - - #= - function check_uint_conversion(::Type{T}) where {T<:Kmer} - U = BioSequences.encoded_data_type(T) - uint = rand(typemin(U):U(one(U) << 2BioSequences.ksize(T) - 1)) - return convert(U, T(uint)) === uint - end - =# - - @testset "Kmer conversion" begin - for len in [1, 16, 32, 64, 128] - # String construction - # Check that kmers in strings survive round trip conversion: - # String → Kmer → String - @test all(Bool[check_string_construction(DNAKmer{len}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(Kmer{DNAAlphabet{4},len}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(RNAKmer{len}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(Kmer{RNAAlphabet{4},len}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_string_construction(AAKmer{len}, random_aa(len)) for _ in 1:reps]) - - # Long(DNA|RNA)Seq Constructions - # Check that DNAKmers can be constructed from a Long(DNA|RNA)Seq - # Long(DNA|RNA)Seq → Kmer → Long(DNA|RNA)Seq - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2},len}, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{4},len}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{4},len}, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2},len}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2},len}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4},len}, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4},len}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2},len}, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(AAKmer{len}, LongAA(random_aa(len))) for _ in 1:reps]) - - # Check Kmer{A1}(::BioSequence{A2}) for compatible A1 and A2 - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4}}, LongRNA{2}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{2}}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{RNAAlphabet{4}}, LongDNA{4}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer{DNAAlphabet{2}}, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - - # BioSequence Construction - # Check that kmers can be constructed from a BioSequence - # BioSequence → Kmer → BioSequence - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{2},len}, LongSequence{DNAAlphabet{2}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{4},len}, LongSequence{DNAAlphabet{4}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{2},len}, LongSequence{DNAAlphabet{4}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{DNAAlphabet{4},len}, LongSequence{DNAAlphabet{2}}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{2},len}, LongSequence{RNAAlphabet{2}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{4},len}, LongSequence{RNAAlphabet{4}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{2},len}, LongSequence{RNAAlphabet{4}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(Kmer{RNAAlphabet{4},len}, LongSequence{RNAAlphabet{2}}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_biosequence_construction(AAKmer{len}, LongSequence{AminoAcidAlphabet}(random_aa(len))) for _ in 1:reps]) - - # Check Kmer(::BioSequence) construction - @test all(Bool[check_longsequence_construction(Kmer, LongRNA{4}(random_rna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer, LongDNA{2}(random_dna_kmer(len))) for _ in 1:reps]) - @test all(Bool[check_longsequence_construction(Kmer, LongAA(random_rna_kmer(len))) for _ in 1:reps]) - - # Construction from element arrays - # Check that kmers can be constructed from an array of elements - # Vector{T} → Kmer{A,K,N} → Vector{T} - @test all(Bool[check_nucarray_kmer(Kmer{DNAAlphabet{2},len}, random_dna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0])) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{DNAAlphabet{4},len}, random_dna_symbols(len)) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{RNAAlphabet{2},len}, random_rna_symbols(len, [0.25, 0.25, 0.25, 0.25, 0.0])) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(Kmer{RNAAlphabet{4},len}, random_rna_symbols(len)) for _ in 1:reps]) - @test all(Bool[check_nucarray_kmer(AAKmer{len}, random_aa_symbols(len)) for _ in 1:reps]) - - # Roundabout conversions - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{2},len}, DNAAlphabet{2}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{4},len}, DNAAlphabet{4}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{2},len}, DNAAlphabet{4}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{DNAAlphabet{4},len}, DNAAlphabet{2}, random_dna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{2},len}, RNAAlphabet{2}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{4},len}, RNAAlphabet{4}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{2},len}, RNAAlphabet{4}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(Kmer{RNAAlphabet{4},len}, RNAAlphabet{2}, random_rna_kmer(len)) for _ in 1:reps]) - @test all(Bool[check_roundabout_construction(AAKmer{len}, AminoAcidAlphabet, random_aa(len)) for _ in 1:reps]) - end - end - - @test_throws MethodError Kmer() # can't construct 0-mer using `Kmer()` - @test_throws ArgumentError DNAKmer(dna"") # 0-mers not allowed - @test_throws ArgumentError AAKmer(aa"") # 0-mers not allowed - @test_throws ArgumentError DNAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws ArgumentError RNAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws ArgumentError AAKmer{0}(UInt64(0)) # 0-mers not allowed - @test_throws BioSequences.EncodeError Kmer(RNA_A, RNA_C, RNA_G, RNA_N, RNA_U) # no Ns in kmers - @test_throws BioSequences.EncodeError Kmer(DNA_A, DNA_C, DNA_G, DNA_N, DNA_T) # no Ns in kmers - @test_throws BioSequences.EncodeError RNAKmer(rna"ACGNU") # no Ns in 2-bit nucleic acid kmers - @test_throws BioSequences.EncodeError DNAKmer(dna"ACGNT") # no Ns in 2-bit nucleic acid kmers - @test_throws MethodError Kmer(RNA_A, DNA_A) # no mixing of RNA and DNA - - @testset "From strings" begin - @test DNAKmer("ACTG") == DNAKmer(LongDNA{4}("ACTG")) - @test RNAKmer("ACUG") == RNAKmer(LongRNA{4}("ACUG")) - - # N is not allowed in Kmers - @test_throws Exception DNAMmer("ACGTNACGT") - @test_throws Exception RNAKmer("ACGUNACGU") - - # Test string literals - @test mer"ACTG"dna == DNAKmer(LongDNA{4}("ACTG")) - @test mer"AVBM"aa == AAKmer(LongAA("AVBM")) - @test isa(mer"ACGT"dna, DNAKmer{4}) - @test isa(mer"AVBM"aa, AAKmer{4}) - @test_throws LoadError eval(:(mer"ACGN"dna)) - @test_throws LoadError eval(:(mer"ACG-"dna)) - end - - @testset "Capacity" begin - @test Kmers.capacity(DNAKmer(random_dna_kmer(10))) == 32 - @test Kmers.capacity(RNAKmer(random_rna_kmer(10))) == 32 - @test Kmers.capacity(DNAKmer(random_dna_kmer(32))) == 32 - @test Kmers.capacity(RNAKmer(random_rna_kmer(32))) == 32 - @test Kmers.capacity(DNAKmer(random_dna_kmer(33))) == 64 - @test Kmers.capacity(AAKmer(random_aa(8))) == 8 - @test Kmers.capacity(AAKmer(random_aa(10))) == 16 - end - - @testset "N unused" begin - @test Kmers.n_unused(DNAKmer(random_dna_kmer(10))) == 22 - @test Kmers.n_unused(RNAKmer(random_rna_kmer(10))) == 22 - @test Kmers.n_unused(DNAKmer(random_dna_kmer(32))) == 0 - @test Kmers.n_unused(RNAKmer(random_rna_kmer(32))) == 0 - @test Kmers.n_unused(DNAKmer(random_dna_kmer(33))) == 31 - @test Kmers.n_unused(AAKmer(random_aa(8))) == 0 - @test Kmers.n_unused(AAKmer(random_aa(10))) == 6 - end -end diff --git a/test/debruijn_neighbors.jl b/test/debruijn_neighbors.jl deleted file mode 100644 index c011b84..0000000 --- a/test/debruijn_neighbors.jl +++ /dev/null @@ -1,6 +0,0 @@ -@testset "De Bruijn Neighbors" begin - @test collect(fw_neighbors(DNAKmer("ACG"))) == map(DNAKmer, ["CGA", "CGC", "CGG", "CGT"]) - @test collect(fw_neighbors(DNAKmer("GGGG"))) == map(DNAKmer, ["GGGA", "GGGC", "GGGG", "GGGT"]) - @test collect(fw_neighbors(RNAKmer("ACG"))) == map(RNAKmer, ["CGA", "CGC", "CGG", "CGU"]) - @test collect(fw_neighbors(RNAKmer("GGGG"))) == map(RNAKmer, ["GGGA", "GGGC", "GGGG", "GGGU"]) -end diff --git a/test/find.jl b/test/find.jl deleted file mode 100644 index c3d4aaa..0000000 --- a/test/find.jl +++ /dev/null @@ -1,46 +0,0 @@ -@testset "Find" begin - kmer = DNAKmer("ACGAG") - - @test findnext(DNA_A, kmer, 1) == 1 - @test findnext(DNA_C, kmer, 1) == 2 - @test findnext(DNA_G, kmer, 1) == 3 - @test findnext(DNA_T, kmer, 1) == nothing - @test findnext(DNA_A, kmer, 2) == 4 - - @test_throws BoundsError findnext(DNA_A, kmer, 0) - @test findnext(DNA_A, kmer, 6) === nothing - - @test findprev(DNA_A, kmer, 5) == 4 - @test findprev(DNA_C, kmer, 5) == 2 - @test findprev(DNA_G, kmer, 5) == 5 - @test findprev(DNA_T, kmer, 5) == nothing - @test findprev(DNA_G, kmer, 4) == 3 - - @test findprev(DNA_A, kmer, 0) === nothing - @test_throws BoundsError findprev(DNA_A, kmer, 6) - - @test findfirst(DNA_A, kmer) == 1 - @test findfirst(DNA_G, kmer) == 3 - @test findlast(DNA_A, kmer) == 4 - @test findlast(DNA_G, kmer) == 5 - - kmer = AAKmer("AMVKFPSMT") - - @test findnext(AA_A, kmer, 1) == 1 - @test findnext(AA_M, kmer, 1) == 2 - @test findnext(AA_V, kmer, 1) == 3 - @test findnext(AA_K, kmer, 1) == 4 - @test findnext(AA_F, kmer, 1) == 5 - @test findnext(AA_P, kmer, 1) == 6 - @test findnext(AA_S, kmer, 1) == 7 - @test findnext(AA_M, kmer, 1) == 2 - @test findnext(AA_T, kmer, 1) == 9 - - @test findnext(AA_F, kmer, 4) == 5 - @test findprev(AA_F, kmer, 4) == nothing - @test findnext(AA_A, kmer, 7) == nothing - @test findnext(AA_M, kmer, 5) == 8 - - @test findfirst(AA_M, kmer) == 2 - @test findlast(AA_M, kmer) == 8 -end diff --git a/test/iteration.jl b/test/iteration.jl deleted file mode 100644 index 801a4b0..0000000 --- a/test/iteration.jl +++ /dev/null @@ -1,214 +0,0 @@ -@testset "EveryKmer" begin - @testset "EveryKmer DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - # Kmer and sequence Alphabets match. - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - - # Kmer and sequence Alphabets mismatch. - s3 = dna"AC-TGAG--TGC" - @test collect(EveryKmer{DNACodon}(s3)) == - [(UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(5), Kmer(DNA_G, DNA_A, DNA_G)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C))] - end - - @testset "EveryKmer RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - # Kmer and sequence Alphabets match. - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - - # Kmer and sequence Alphabets mismatch. - s3 = rna"AC-UGAG--UGC" - @test collect(EveryKmer{RNACodon}(s3)) == - [(UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(5), Kmer(RNA_G, RNA_A, RNA_G)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C))] - end - - @testset "EveryKmer AA" begin - s = randaaseq(500) - s2 = LongAA(s) - @test collect(EveryKmer(s, Val{31}())) == collect(EveryKmer(s2, Val{31}())) - @test length(EveryKmer(s, Val{31}())) == length(EveryKmer(s2, Val{31}())) == 470 - - @test collect(EveryKmer(s, Val{201}())) == collect(EveryKmer(s2, Val{201}())) - @test length(EveryKmer(s, Val{201}())) == length(EveryKmer(s2, Val{201}())) == 300 - end -end - -@testset "SpacedKmers" begin - @testset "SpacedKmers DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 - - s3 = dna"AC-TGAG--TGC" - @test collect(SpacedKmers{DNACodon}(s3, 3)) == - [(UInt64(4), Kmer(DNA_T, DNA_G, DNA_A)), - (UInt64(10), Kmer(DNA_T, DNA_G, DNA_C))] - end - - @testset "SpacedKmers RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 - - s3 = rna"AC-UGAG--UGC" - @test collect(SpacedKmers{RNACodon}(s3, 3)) == - [(UInt64(4), Kmer(RNA_U, RNA_G, RNA_A)), - (UInt64(10), Kmer(RNA_U, RNA_G, RNA_C))] - end - - @testset "SpacedKmers AA" begin - s = randaaseq(500) - s2 = LongAA(s) - @test collect(SpacedKmers(s, Val{31}(), 50)) == collect(SpacedKmers(s2, Val{31}(), 50)) - @test length(SpacedKmers(s, Val{31}(), 50)) == length(SpacedKmers(s2, Val{31}(), 50)) == 10 - - @test collect(SpacedKmers(s, Val{201}(), 50)) == collect(SpacedKmers(s2, Val{201}(), 50)) - @test length(SpacedKmers(s, Val{201}(), 50)) == length(SpacedKmers(s2, Val{201}(), 50)) == 6 - end -end - -@testset "EveryCanonicalKmer" begin - @testset "EveryCanonicalKmer DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - - # Iterator generates expected results... - ## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == - collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == - collect(EveryCanonicalKmer(s2, Val{201}())) - - ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == - collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == - collect(EveryCanonicalKmer(s, Val{201}())) - - # Test equivalency between different levels of bit compression... - @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - - # Kmer and sequence Alphabets mismatch. - s3 = dna"AC-TGAG--TGC" - @test collect(EveryCanonicalKmer{DNACodon}(s3)) == - [(UInt64(4), canonical(Kmer(DNA_T, DNA_G, DNA_A))), - (UInt64(5), canonical(Kmer(DNA_G, DNA_A, DNA_G))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C)))] - end - - @testset "EveryCanonicalKmer RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - - # Iterator generates expected results... - ## 2-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{31}())] == - collect(EveryCanonicalKmer(s2, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s2, Val{201}())] == - collect(EveryCanonicalKmer(s2, Val{201}())) - - ## 4-Bit DNA - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{31}())] == - collect(EveryCanonicalKmer(s, Val{31}())) - - @test [(x[1], canonical(x[2])) for x in EveryKmer(s, Val{201}())] == - collect(EveryCanonicalKmer(s, Val{201}())) - - # Test equivalency between different levels of bit compression... - @test [x[2] for x in EveryCanonicalKmer(s, Val{31}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{31}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{31}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{31}())])) - - @test [x[2] for x in EveryCanonicalKmer(s, Val{201}())] == - [x[2] for x in EveryCanonicalKmer(s2, Val{201}())] - @test all(iscanonical.([x[2] for x in EveryCanonicalKmer(s, Val{201}())])) && - all(iscanonical.([x[2] for x in EveryCanonicalKmer(s2, Val{201}())])) - - s3 = rna"AC-UGAG--UGC" - @test collect(EveryCanonicalKmer{RNACodon}(s3)) == - [(UInt64(4), canonical(Kmer(RNA_U, RNA_G, RNA_A))), - (UInt64(5), canonical(Kmer(RNA_G, RNA_A, RNA_G))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C)))] - end -end - -@testset "SpacedCanonicalKmers" begin - @testset "SpacedCanonicalKmers DNA" begin - s = randdnaseq(500) - s2 = LongDNA{2}(s) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == 10 - - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == 6 - - s3 = dna"AC-TGAG--TGC" - @test collect(SpacedCanonicalKmers{DNACodon}(s3, 3)) == - [(UInt64(4), canonical(Kmer(DNA_T, DNA_C, DNA_A))), - (UInt64(10), canonical(Kmer(DNA_T, DNA_G, DNA_C)))] - end - - @testset "SpacedCanonicalKmers RNA" begin - s = randrnaseq(500) - s2 = LongRNA{2}(s) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{31}(), 50)] == collect(SpacedCanonicalKmers(s, Val{31}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{31}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{31}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{31}(), 50)) == length(SpacedCanonicalKmers(s2, Val{31}(), 50)) == 10 - - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test [(x[1], canonical(x[2])) for x in SpacedKmers(s2, Val{201}(), 50)] == collect(SpacedCanonicalKmers(s, Val{201}(), 50)) - @test collect(SpacedCanonicalKmers(s, Val{201}(), 50)) == collect(SpacedCanonicalKmers(s2, Val{201}(), 50)) - @test length(SpacedCanonicalKmers(s, Val{201}(), 50)) == length(SpacedCanonicalKmers(s2, Val{201}(), 50)) == 6 - - s3 = rna"AC-UGAG--UGC" - @test collect(SpacedCanonicalKmers{RNACodon}(s3, 3)) == - [(UInt64(4), canonical(Kmer(RNA_U, RNA_C, RNA_A))), - (UInt64(10), canonical(Kmer(RNA_U, RNA_G, RNA_C)))] - end -end \ No newline at end of file diff --git a/test/length.jl b/test/length.jl deleted file mode 100644 index 598980e..0000000 --- a/test/length.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testset "Length" begin - for len in [1, 16, 32, 64, 128] - @test length(DNAKmer(random_dna_kmer(len))) == len - @test length(RNAKmer(random_rna_kmer(len))) == len - @test length(AAKmer(random_aa(len))) == len - end -end diff --git a/test/mismatches.jl b/test/mismatches.jl deleted file mode 100644 index b8e1bb0..0000000 --- a/test/mismatches.jl +++ /dev/null @@ -1,51 +0,0 @@ -@testset "Mismatches" begin - function test_mismatches(a, b) - count = 0 - for (x, y) in zip(a, b) - count += x != y - end - @test mismatches(a, b) === mismatches(b, a) === count - end - - for len in 1:64, _ in 1:10 - a = random_dna_kmer(len) - b = random_dna_kmer(len) - test_mismatches(DNAKmer(a), DNAKmer(b)) - test_mismatches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - - a = random_rna_kmer(len) - b = random_rna_kmer(len) - test_mismatches(RNAKmer(a), RNAKmer(b)) - test_mismatches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - - a = AAKmer(random_aa(len)) - b = AAKmer(random_aa(len)) - test_mismatches(a, b) - end -end - -@testset "Matches" begin - function test_matches(a, b) - count = 0 - for (x, y) in zip(a, b) - count += x == y - end - @test matches(a, b) === matches(b, a) === count - end - - for len in 1:64, _ in 1:10 - a = random_dna_kmer(len) - b = random_dna_kmer(len) - test_matches(DNAKmer(a), DNAKmer(b)) - test_matches(Kmer{DNAAlphabet{4}}(a), Kmer{DNAAlphabet{4}}(b)) - - a = random_rna_kmer(len) - b = random_rna_kmer(len) - test_matches(RNAKmer(a), RNAKmer(b)) - test_matches(Kmer{RNAAlphabet{4}}(a), Kmer{RNAAlphabet{4}}(b)) - - a = AAKmer(random_aa(len)) - b = AAKmer(random_aa(len)) - test_matches(a, b) - end -end \ No newline at end of file diff --git a/test/order.jl b/test/order.jl deleted file mode 100644 index 8b08201..0000000 --- a/test/order.jl +++ /dev/null @@ -1,7 +0,0 @@ -@testset "Order" begin - @test DNAMer("AA") < DNAMer("AC") < DNAMer("AG") < DNAMer("AT") < DNAMer("CA") - @test RNAMer("AA") < RNAMer("AC") < RNAMer("AG") < RNAMer("AU") < RNAMer("CA") - - @test BigDNAMer("AA") < BigDNAMer("AC") < BigDNAMer("AG") < BigDNAMer("AT") < BigDNAMer("CA") - @test BigRNAMer("AA") < BigRNAMer("AC") < BigRNAMer("AG") < BigRNAMer("AU") < BigRNAMer("CA") -end diff --git a/test/print.jl b/test/print.jl deleted file mode 100644 index 9e8f486..0000000 --- a/test/print.jl +++ /dev/null @@ -1,37 +0,0 @@ -@testset "Print" begin - buf = IOBuffer() - - print(buf, DNAKmer("ACGT")) - @test String(take!(buf)) == "ACGT" - - print(buf, RNAKmer("ACGU")) - @test String(take!(buf)) == "ACGU" - - print(buf, Kmer{DNAAlphabet{4}}("ACGT")) - @test String(take!(buf)) == "ACGT" - - print(buf, Kmer{RNAAlphabet{4}}("ACGU")) - @test String(take!(buf)) == "ACGU" - - print(buf, AAKmer("AMVKFPSMT")) - @test String(take!(buf)) == "AMVKFPSMT" -end - -@testset "Show" begin - buf = IOBuffer() - - show(buf, DNAKmer("AGAGT")) - @test String(take!(buf)) == "AGAGT" - - show(buf, RNAKmer("AGAGU")) - @test String(take!(buf)) == "AGAGU" - - show(buf, Kmer{DNAAlphabet{4}}("AGAGT")) - @test String(take!(buf)) == "AGAGT" - - show(buf, Kmer{RNAAlphabet{4}}("AGAGU")) - @test String(take!(buf)) == "AGAGU" - - print(buf, AAKmer("AMVKFPSMT")) - @test String(take!(buf)) == "AMVKFPSMT" -end diff --git a/test/random.jl b/test/random.jl deleted file mode 100644 index 48074e7..0000000 --- a/test/random.jl +++ /dev/null @@ -1,26 +0,0 @@ -@testset "Random" begin - @testset for k in 1:64 - for _ in 1:10 - kmer = rand(DNAKmer{k}) - @test isa(kmer, DNAKmer{k}) - kmer = rand(RNAKmer{k}) - @test isa(kmer, RNAKmer{k}) - end - for size in [0, 1, 2, 5, 10, 100] - @test length(rand(DNAKmer{k}, size)) == size - @test length(rand(RNAKmer{k}, size)) == size - end - kmers = rand(DNAKmer{k}, 10_000) - for i in 1:k - a = sum([kmer[i] for kmer in kmers] .== DNA_A) - c = sum([kmer[i] for kmer in kmers] .== DNA_C) - g = sum([kmer[i] for kmer in kmers] .== DNA_G) - t = sum([kmer[i] for kmer in kmers] .== DNA_T) - @test 2200 ≤ a ≤ 2800 - @test 2200 ≤ c ≤ 2800 - @test 2200 ≤ g ≤ 2800 - @test 2200 ≤ t ≤ 2800 - @test a + c + g + t == 10_000 - end - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 4260278..c55b703 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,28 +1,576 @@ module TestKmers -using Kmers -using BioSequences using Test - -const GROUP = get(ENV, "GROUP", "All") +using Random +using Kmers +using BioSequences +using BioSymbols include("utils.jl") -if GROUP == "BioSequences" || GROUP == "All" - include("biosequences_interface.jl") - include("construction_and_conversion.jl") - include("comparisons.jl") - include("length.jl") - include("access.jl") - include("random.jl") - include("find.jl") - include("print.jl") - include("transformations.jl") - include("mismatches.jl") - include("debruijn_neighbors.jl") - include("iteration.jl") - include("translation.jl") - #include("shuffle.jl") +@testset "BioSequences Interface" begin + for A in + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}, AminoAcidAlphabet] + for K in (1, 9, 116) + @test BioSequences.has_interface( + BioSequence, + Kmers.derive_type(Kmer{A, K}), + rand(collect(A()), K), + false, + ) + end + end +end + +struct CharSymbol <: BioSymbol + x::Char +end +BioSymbols.prefix(::CharSymbol) = "Char" +BioSymbols.type_text(::CharSymbol) = "CharSymbol" +BioSymbols.isterm(::CharSymbol) = false + +# These two are not interface, but useful for the tests below +# (e.g. converting the sequence to a string) +Base.convert(::Type{Char}, x::CharSymbol) = x.x +Base.Char(x::CharSymbol) = convert(Char, x) +Base.convert(::Type{CharSymbol}, x::Char) = CharSymbol(x) +BioSymbols.isgap(::CharSymbol) = false + +# TODO: Should BioSymbols be updated to remove this? +BioSymbols.isvalid(::CharSymbol) = true + +struct CharAlphabet <: Alphabet end +Base.eltype(::Type{CharAlphabet}) = CharSymbol +BioSequences.symbols(::CharAlphabet) = ntuple(i -> CharSymbol(Char(i - 1)), Val{128}()) +BioSequences.encode(::CharAlphabet, c::CharSymbol) = reinterpret(UInt32, c.x) % UInt +BioSequences.decode(::CharAlphabet, c::UInt) = CharSymbol(reinterpret(Char, c % UInt32)) +BioSequences.BitsPerSymbol(::CharAlphabet) = BioSequences.BitsPerSymbol{32}() + +const ALPHABETS = [ + DNAAlphabet{2}(), + RNAAlphabet{2}(), + DNAAlphabet{4}(), + RNAAlphabet{4}(), + AminoAcidAlphabet(), + CharAlphabet(), +] + +@testset "Construction" begin + # Fundamentals + dna = dna"TAGCTAAC" + mer = Kmer{DNAAlphabet{2}, length(dna)}(dna) + @test mer isa Kmer{DNAAlphabet{2}, length(dna)} + @test DNACodon == DNAKmer{3, 1} + @test RNACodon == RNAKmer{3, 1} + + for A in ALPHABETS + Ta = typeof(A) + for L in [0, 3, 11, 41] + for i in 1:3 + # Fundamentals and length + s = randseq(A, SamplerUniform(symbols(A)), L) + mer = Kmer{Ta, L}(collect(s)) + @test mer isa Kmer{Ta, L} + @test length(mer) == L + @test string(mer) == string(s) + + # From string + mer2 = Kmer{Ta, L}(string(s)) + @test mer == mer2 + @test mer === mer2 + @test string(mer) == string(mer2) + + # From LongSequence + mer3 = Kmer{Ta, L}(s) + @test mer === mer3 + end + end + end + + @testset "Construct from string" begin + for s in ["TAG", "ACCGAGCC", "TGATGCTATTAGG"] + L = length(s) + for ss in (s, view(s, 1:lastindex(s))) + @test DNAKmer{L, 1}(ss) == DNAKmer{L}(ss) + @test string(DNAKmer{L}(ss)) == ss + end + end + + s = "UHVKALRIQURPFLSMOF" + @test string(AAKmer{18}(s)) == s + + for s in ["αβγδϵ", "", "中国人大网"] + L = length(s) + for ss in (s, view(s, 1:lastindex(s))) + sq = Kmer{CharAlphabet, L}(ss) + @test string(sq) == s + @test [Char(i) for i in sq] == collect(ss) + end + end + end + + @testset "Wrong length" begin + @test_throws Exception DNAKmer{4}("TAC") + @test_throws Exception DNAKmer{4}("TACCA") + @test_throws Exception Kmer{CharAlphabet, 2}(['T']) + @test_throws Exception AAMer{3}((AminoAcid(i) for i in "WPLK" if true)) + end + + @testset "Length must be given explicitly" begin + for s in ["TACA", ""] + @test_throws Exception DNAKmer("TACGA") + @test string(DNAKmer{length(s)}(s)) == s + end + @test_throws Exception AAMer(aa"WPLKM") + @test collect(AAKmer{5}(aa"WPLKM")) == collect(aa"WPLKM") + end + + @testset "Kmer literal" begin + @test collect(mer"TGAGTCA"d) == collect(dna"TGAGTCA") + @test collect(mer"WQOPMKAP"a) == collect(aa"WQOPMKAP") + @test collect(mer"UAUCGGAUC"r) == collect(rna"UAUCGGAUC") + end + + @testset "Construct from Biosequences" begin + @testset "Construct from LongSequence" begin + for seq in [ + dna"TAGGCA", + rna"UUCUGUGAGUCC", + aa"TTCGGAA", + LongSequence{CharAlphabet}("HELLO"), + ] + for sq in [seq, view(seq, 2:lastindex(seq))] + A = typeof(Alphabet(sq)) + @test Kmer{A, length(sq)}(sq) == Kmer{A, length(sq)}(string(sq)) + @test string(Kmer{A, length(sq)}(sq)) == string(sq) + @test_throws Exception Kmer{A, length(sq) + 1}(sq) + end + end + end + + @testset "Construct from kmer" begin + m = mer"TAGCGTTA"d + m2 = DNAKmer{8}(m) + @test m === m2 + @test_throws Exception DNAKmer{7}(m) + m3 = RNAKmer{8}(m) + @test m3 === mer"UAGCGUUA"r + @test_throws Exception RNAKmer{9}(m) + @test_throws Exception AAKmer{8}(m) + end + + # From generic biosequence - TODO + end + + @testset "Construct from iterable" begin + m1 = DNAKmer{6}((i for i in dna"GCGATC")) + m2 = DNAKmer{6}((i for i in dna"ATCGATGCAA" if i ∈ (DNA_A, DNA_C))) + @test m1 === mer"GCGATC"d + @test m2 === mer"ACACAA"d + m3 = DNAKmer{4}((i for i in rna"GAUC" if true)) + @test m3 === mer"GATC"d + end +end + +@testset "Comparison" begin + @testset "Equality" begin + @test mer"KMNUPQCX"a == mer"KMNUPQCX"a + @test mer"PKMNEA"a != mer"PKMNE"a + @test mer"IUDHLDJVIPOEJKWE"a != mer"IUDHLDJVIPOEJKW"a + end + + @testset "Ordering" begin + @test mer"UGCAG"r > mer"CGCAG"r + @test mer"TCGGAAG"d > mer"TCGGAAC"d + @test mer"OEWPM"a > mer"OEWP"a + @test mer"UGCGA"r > mer"TGAGA"d + end + + @testset "Hashing, isless and isequal" begin + @test hash(mer"POSMDGF"a, UInt(15)) === hash(mer"POSMDGF"a, UInt(15)) + @test isequal(mer"POSMDGF"a, mer"POSMDGF"a) + + # Same, but DNA/RNA + m1 = mer"TAGCTA"d + m2 = mer"UAGCUA"r + @test isequal(m1, m2) + @test hash(m1) === hash(m2) + m3 = Kmer{DNAAlphabet{4}}(m1) + m4 = Kmer{RNAAlphabet{4}}(m2) + @test isequal(m3, m4) + @test hash(m3) === hash(m4) + + # Other kmers + # This throws because we want kmer hashing to be maximally fast, + # which implies they must have a different hashing strategy from + # other BioSequences, which implies they can't be isequal + @test_throws Exception isequal(mer"UGCUGA"r, mer"UGCUGA"a) + @test !isequal(mer"UGCAC"r, mer"UGCGA"r) + + # Other sequences + @test_throws Exception dna"TAG" == mer"TAG"d + end +end + +@testset "Access" begin + @testset "Scalar indexing" begin + m = mer"TGATGCTAGTAGTATTCTATAG"d + @test m isa Mer{22} + + @test m[1] == first(m) == DNA_T + @test m[3] == DNA_A + @test last(m) == m[22] == DNA_G + + @test_throws BoundsError m[0] + @test_throws BoundsError m[-1] + @test_throws BoundsError m[23] + + for s in + [dna"TAGCAAC", dna"TWKKSVVDNA-A", rna"UGUGUCA", rna"UGUCGWS", aa"PLLKMDDSH"] + m = Kmer{typeof(Alphabet(s)), length(s)}(s) + @test first(m) == first(s) + @test last(m) == last(s) + for i in [1, 3, 5] + @test s[i] == m[i] + end + end + + # Weirdly, this throws ArgumentError + @test_throws Exception first(DNAKmer{0}("")) + @test_throws Exception last(RNAKmer{0}("")) + end + + @testset "Unit ranges" begin + m = mer"POKDGTWDIKVL"a + @test m isa Mer{12} + + @test m[1:3] === mer"POK"a + @test m[2:6] === mer"OKDGT"a + @test m[6:(end - 1)] === mer"TWDIKV"a + + @test m[eachindex(m)] === m + @test m[Base.OneTo(4)] === mer"POKD"a + + @test_throws BoundsError m[0:4] + @test m[0:-1] == AAKmer{0}("") + @test_throws BoundsError m[2:13] + end + + @testset "With vector of indices" begin + m = mer"UGCUGAUCGUAU"r + @test m isa Mer{12} + + @test m[[1, 3, 5]] == mer"UCG"r + @test m[[12, 9, 7]] == mer"UGU"r + @test m[Int[]] == Kmer{RNAAlphabet{2}, 0}("") + + @test_throws BoundsError m[[2, 8, 15]] + @test_throws BoundsError m[[0, 1]] + @test_throws BoundsError m[[13]] + end + + @testset "Logical indexing" begin + m = Kmer{CharAlphabet, 4}("ØÆGD") + @test m[[true, false, true, false]] == Kmer{CharAlphabet, 2}("ØG") + @test m[trues(4)] === m + @test m[falses(4)] === Kmer{CharAlphabet, 0}("") + + @test_throws BoundsError m[[true, false, true, true, true]] + @test_throws BoundsError m[[false, false, true]] + @test_throws BoundsError m[trues(5)] + end +end + +@testset "Modification" begin + @testset "push, push_first" begin + m = mer"UHALSAP"a + @test push(m, AA_W) == mer"UHALSAPW"a + @test push(push(m, AA_W), AA_M) === mer"UHALSAPWM"a + @test push_first(m, AA_Gap) == mer"-UHALSAP"a + @test push_first(push(m, AA_K), AA_H) == mer"HUHALSAPK"a + + @test push(m, 'K') == mer"UHALSAPK"a + @test push(mer"TAG"d, RNA_A) == mer"TAGA"d + end + + @testset "shift, shiftfirst" begin + m = mer"PDOFPOLEF"a + v = collect(m) + for aa in aa"PLLMWFVB" + m = shift(m, aa) + @test m isa Mer{9} + popfirst!(push!(v, aa)) + @test collect(m) == v + end + + m = mer"AUGCGUA"r + v = collect(m) + for dna in dna"TAGTGTGCTA" + m = shift_first(m, dna) + @test m isa Mer{7} + pop!(pushfirst!(v, dna)) + @test collect(m) == v + end + end + + @testset "pop" begin + m = mer"LNPQ"a + @test (m = pop(m)) == mer"LNP"a + @test (m = pop(m)) == mer"LN"a + @test (m = pop(m)) == mer"L"a + @test (m = pop(m)) == AAKmer{0}("") + @test_throws ArgumentError pop(m) + + @test pop(mer"MDFFIJFKL"a) === mer"MDFFIJFK"a + end +end + +@testset "Biological operations" begin + for s in [ + dna"", + aa"", + LongDNA{2}(dna"TAGTGCA"), + LongRNA{2}(rna"UGCUGUAA"), + dna"TGASWKHVAAN--A", + rna"UAGUCUYMNS", + aa"LKHWSYYVQN", + LongSequence{CharAlphabet}("LKDSJ"), + LongSequence{CharAlphabet}("κ𝚶⊸∑Γ"), + ] + m = Kmer{typeof(Alphabet(s)), length(s)}(s) + + # Reverse + @test collect(reverse(m)) == reverse(collect(m)) + @test collect(reverse(m)) == collect(reverse(s)) + + # The rest of the operations are only for nucleotides + isa(Alphabet(s), NucleicAcidAlphabet) || continue + + # Complement + @test collect(complement(s)) == collect(complement(m)) + + # Reverse complement + rv = reverse_complement(m) + @test collect(reverse_complement(s)) == collect(rv) + + # Canonical + can = canonical(m) + @test collect(can) == collect(canonical(s)) + @test can ≤ m + if can === m + @test m ≤ rv + else + @test can === rv + @test rv ≤ m + end + end +end + +@testset "Translation" begin + @testset "Forward translation" begin + # Empty + @test translate(mer""r) == mer""a + @test translate(mer""d) == mer""a + @test translate(Kmer{DNAAlphabet{4}, 0}("")) == mer""a + + # Not divisible by 3 + @test_throws Exception translate(mer"U"r) + @test_throws Exception translate(mer"UGCA"r) + @test_throws Exception translate(mer"GUCGAUUGUC"r) + + # Containing gaps + @test_throws Exception translate(Kmer{DNAAlphabet{4}, 6}("CTGA-C")) + @test_throws Exception translate(Kmer{RNAAlphabet{4}, 3}("UC-")) + + # Invalid alphabet + @test_throws Exception transate(mer"CCC"a) + @test_throws Exception transate(Kmer{CharAlphabet, 3}("GGG")) + + # Compare to LongSequence + for s in [ + rna"UCGUAGUUCGAUUCUAUGCUGUAGUGGCAA", + rna"UCGUAGGCGUAUUGCGCAAAGCGC", + rna"UGCUAGUGUUCGAAA", + rna"UCGUUAGUAAAA", + ] + for A in [DNAAlphabet{4}, RNAAlphabet{2}, DNAAlphabet{2}, RNAAlphabet{4}] + ss = LongSequence{A}(s) + @test collect(translate(ss)) == collect(translate(Kmer{A, length(s)}(s))) + end + end + + for s in [ + rna"UGCUGAWKVUDUGWUGUDHUAGUGCNUBGKUGCMGGSWC", + rna"UCGUAGUCKGUCGUYCUGAGGWUGCUGANNUGCUGA", + rna"CAGGCCAGWGCUGSSSCUGSMGKYVUCUAS", + ] + for A in [DNAAlphabet{4}, RNAAlphabet{4}] + ss = LongSequence{A}(s) + @test collect(translate(ss)) == collect(translate(Kmer{A, length(s)}(s))) + end + end + + # Skip 1, the index of gap (which cannot be translated) + A = alphabet(RNA) + for i in 2:16, j in 2:16, k in 2:16 + mer = Kmer{RNAAlphabet{4}, 3}((A[i], A[j], A[k])) + @test only(translate(mer)) == only(translate(LongSequence(mer))) + end + end + + @testset "CodonSet" begin + codons = [RNACodon((i, j, k)) for i in mer"UACG"r, j in mer"UACG"r, k in mer"UACG"r] + @test length(Set(codons)) == 64 + sources = [[], codons[[1, 4, 8]], codons, codons[rand(Bool, 64)], codons[[4, 8]]] + csets = map(CodonSet, sources) + sets = map(Set, sources) + + # Basic properties + for (cset, set) in zip(csets, sets) + @test cset == set + @test sort!(collect(cset)) == sort!(collect(set)) + @test length(cset) == length(set) + @test isempty(cset) == isempty(set) + for i in set + @test i ∈ cset + end + end + + for (si, ci) in zip(sets, csets), (sj, cj) in zip(sets, csets) + @test issubset(si, sj) == issubset(ci, cj) + for f in [union, setdiff, intersect, symdiff] + @test Set(f(ci, cj)) == f(si, sj) + end + end + end + + @testset "Standard reverse genetic code" begin + seq = LongAA(collect((i for i in alphabet(AminoAcid) if i ∉ (AA_Gap, AA_U, AA_O)))) + codonsets = reverse_translate(seq) + seen_codons = Set{RNACodon}() + for (codonset, aa) in zip(codonsets, seq) + if isambiguous(aa) + bits = zero(compatbits(aa)) + for codon in codonset + bits |= compatbits(only(translate(codon))) + end + # selenocysteine and Pyrrolysine have bits + # 0x00300000. However, translating normal + # codons cannot get these amino acids, + # so we ignore them by masking their bits + @test bits == (compatbits(aa) & 0x000fffff) + else + @test isdisjoint(seen_codons, codonset) + union!(seen_codons, codonset) + for codon in codonset + @test only(translate(codon)) == aa + end + end + end + @test length(seen_codons) == 64 + end + + @testset "Custom reverse genetic code" begin + # TODO! + end end +@testset "Printing" begin + function test_print(s, str) + @test string(s) == str + io = IOBuffer() + print(io, s) + @test String(take!(io)) == str + end + + for s in [ + dna"", + aa"", + LongDNA{2}(dna"TAGTGCA"), + LongRNA{2}(rna"UGCUGUAA"), + dna"TGASWKHVAAN--A", + rna"UAGUCUYMNS", + aa"LKHWSYYVQN", + ] + test_print(s, string(s)) + end +end + +@testset "Iterators" begin + @testset "Forward iteration" begin + @testset "Aliases" begin + @test FwKmers{DNAAlphabet{2}, 3}(dna"TAGA") isa + FwKmers{DNAAlphabet{2}, 3, LongDNA{4}} + @test FwDNAMers{4}(rna"UAGC") isa FwKmers{DNAAlphabet{2}, 4, LongRNA{4}} + @test FwRNAMers{4}(dna"TACA") isa FwKmers{RNAAlphabet{2}, 4, LongDNA{4}} + @test FwAAMers{4}(aa"LKCY") isa FwKmers{AminoAcidAlphabet, 4, LongAA} + end + + @testset "Smaller than K" begin + @test isempty(FwDNAMers{3}(dna"TA")) + @test isempty(FwAAMers{9}(aa"AOPJVPES")) + @test isempty(FwKmers{RNAAlphabet{4}, 6}(dna"ATGGA")) + end + + @testset "Conversible alphabets" begin + for (seqs, alphabets) in [ + ( + [LongDNA{2}("TGATGGCGTAGTA"), LongRNA{2}("UCGUGCUA"), LongDNA{2}("")], + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}], + ), # From two-bit + ( + [dna"TAGTCTGAC", rna"UAGUCGAUUAGGCC"], + [DNAAlphabet{2}, DNAAlphabet{4}, RNAAlphabet{2}, RNAAlphabet{4}], + ), # From four-bit + ] + for seq in seqs, alphabet in alphabets + v1 = collect(FwKmers{alphabet, 3}(seq)) + v2 = [Kmer{alphabet, 3, 1}(seq[i:(i + 2)]) for i in 1:(length(seq) - 2)] + @test v1 == v2 + end + end + for seq in [dna"TGWSNVNTGA", rna"C-GGAU-WSNUCG"] + @test_throws Exception first(FwDNAMers{3}(seq)) + @test_throws Exception first(FwRNAMers{3}(seq)) + end + end + + @testset "Four to two bit" begin + for seq in [dna"TATGCTTCGTAGTCGTCGTTGCTA"] + for seqq in [seq, LongRNA{4}(seq)] + filtered = typeof(seqq)([i for i in seqq if !isambiguous(i)]) + for A in [DNAAlphabet{2}, RNAAlphabet{2}] + v1 = collect(FwKmers{A, 4}(seqq)) + v2 = [ + Kmer{A, 4, 1}(filtered[i:(i + 3)]) for + i in 1:(length(filtered) - 3) + ] + @test v1 == v2 + end + end + end + end + + @testset "From ASCII bytes" begin + str = "TaghWS-TGnADbkWWMSTV" + T = FwKmers{DNAAlphabet{4}, 4} + mers = collect(T(str)) + for source in + [str, view(str, 1:lastindex(str)), codeunits(str), Vector(codeunits(str))] + @test collect(T(source)) == mers + end + end + + # Unconvertible alphabet + # TODO + # Error in the constructor? + #@test_throws FwDNAMers{} + end +end + +# FwRvIterator +# Canonical +# UnambiguousKmers +# SpacedKmers + end # module diff --git a/test/shuffle.jl b/test/shuffle.jl deleted file mode 100644 index 793081b..0000000 --- a/test/shuffle.jl +++ /dev/null @@ -1,25 +0,0 @@ -@testset "Shuffle" begin - for s in ["A", "C", "G", "T"] - kmer = DNAKmer(s) - @test kmer === shuffle(kmer) - end - - function count(kmer) - a = c = g = t = 0 - for x in kmer - a += x == DNA_A - c += x == DNA_C - g += x == DNA_G - t += x == DNA_T - end - return a, c, g, t - end - - for k in 1:64, _ in 1:10 - kmer = rand(DNAKmer{k}) - @test count(kmer) == count(shuffle(kmer)) - if k ≥ 30 - @test kmer != shuffle(kmer) - end - end -end diff --git a/test/transformations.jl b/test/transformations.jl deleted file mode 100644 index 1691688..0000000 --- a/test/transformations.jl +++ /dev/null @@ -1,60 +0,0 @@ -@testset "Transformations" begin - function test_reverse(T, seq) - revseq = reverse(T(seq)) - @test String(revseq) == reverse(seq) - end - - function test_dna_complement(T, seq) - comp = complement(T(seq)) - @test String(comp) == dna_complement(seq) - end - - function test_rna_complement(T, seq) - comp = complement(T(seq)) - @test String(comp) == rna_complement(seq) - end - - function test_dna_revcomp(T, seq) - revcomp = reverse_complement(T(seq)) - @test String(revcomp) == reverse(dna_complement(seq)) - end - - function test_rna_revcomp(T, seq) - revcomp = reverse_complement(T(seq)) - @test String(revcomp) == reverse(rna_complement(seq)) - end - - @testset "Reverse" begin - for len in 1:64, _ in 1:10 - test_reverse(DNAKmer{len}, random_dna_kmer(len)) - test_reverse(RNAKmer{len}, random_rna_kmer(len)) - end - - seq = dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAGATAC" - @test reverse(seq[(length(seq)-9):length(seq)]) == dna"CATAGAAAAA" - end - - @testset "Complement" begin - for len in 1:64, _ in 1:10 - test_dna_complement(DNAKmer{len}, random_dna_kmer(len)) - test_rna_complement(RNAKmer{len}, random_rna_kmer(len)) - end - end - - @testset "Reverse Complement" begin - for len in 1:64, _ in 1:10 - test_dna_revcomp(DNAKmer{len}, random_dna_kmer(len)) - test_rna_revcomp(RNAKmer{len}, random_rna_kmer(len)) - end - end - - @testset "Canonical" begin - @test canonical(DNAKmer{4,1}("ACCG")) == DNAKmer{4,1}("ACCG") - @test canonical(DNAKmer{4,1}("GCAC")) == DNAKmer{4,1}("GCAC") - @test canonical(RNAKmer{4,1}("AAUU")) == RNAKmer{4,1}("AAUU") - @test canonical(RNAKmer{4,1}("UGGA")) == RNAKmer{4,1}("UCCA") - @test canonical(RNAKmer{4,1}("CGAU")) == RNAKmer{4,1}("AUCG") - @test canonical(RNAKmer{4,1}("UGGA")) == RNAKmer{4,1}("UCCA") - @test canonical(DNAKmer{4,1}("GCAC")) == DNAKmer{4,1}("GCAC") - end -end diff --git a/test/translation.jl b/test/translation.jl index ccf34dc..38d8cdb 100644 --- a/test/translation.jl +++ b/test/translation.jl @@ -1,141 +1,138 @@ @testset "Translation" begin - -sampler = BioSequences.SamplerWeighted( - dna"ACGTMRSVWYHKDBN", - vcat(fill(0.225, 4), fill(0.00909, 10)) -) - -for A in (RNAAlphabet, DNAAlphabet) - for N in (2, 4) - for len in [3, 15, 33, 66] - for alternative in (true, false) - seq = if N == 2 - randseq(A{2}(), len) - else - randseq(A{4}(), sampler, len) + sampler = BioSequences.SamplerWeighted( + dna"ACGTMRSVWYHKDBN", + vcat(fill(0.225, 4), fill(0.00909, 10)), + ) + + for A in (RNAAlphabet, DNAAlphabet) + for N in (2, 4) + for len in [3, 15, 33, 66] + for alternative in (true, false) + seq = if N == 2 + randseq(A{2}(), len) + else + randseq(A{4}(), sampler, len) + end + kmer = Kmer{A{N}}(seq) + @test ( + translate(seq; alternative_start=alternative) == + translate(kmer; alternative_start=alternative) + ) end - kmer = Kmer{A{N}}(seq) - @test ( - translate(seq, alternative_start=alternative) == - translate(kmer, alternative_start=alternative) - ) end end end -end -# Throws when ambiguous -@test_throws Exception translate( - Kmer{RNAAlphabet{4}}("AUGCCGCMA"), - allow_ambiguous_codons=false -) + # Throws when ambiguous + @test_throws Exception translate( + Kmer{RNAAlphabet{4}}("AUGCCGCMA"), + allow_ambiguous_codons=false, + ) + + # Not divisible by 3 + @test_throws Exception translate(mer"UG"r) + @test_throws Exception translate(mer"TAGCTTAA"d) + @test_throws Exception translate(mer"CUGUAGUUGUCGC"r) + @test_throws Exception translate(mer"AGCGA"d) + + # Cannot transla AA seq + @test_throws MethodError translate(mer"LLVM"aa) + @test_throws MethodError translate(mer"ATG"aa) +end # translation + +@testset "CodonSet" begin + CodonSet = Kmers.CodonSet -# Not divisible by 3 -@test_throws Exception translate(mer"UG"r) -@test_throws Exception translate(mer"TAGCTTAA"d) -@test_throws Exception translate(mer"CUGUAGUUGUCGC"r) -@test_throws Exception translate(mer"AGCGA"d) + SAMPLE_SOURCES = Any[ + [mer"UAG"r, mer"ACC"r, mer"ACC"r, mer"UGG"r], + RNACodon[], + [mer"AAA"r, mer"ACC"r, mer"AAA"r, mer"UCA"r, mer"UCC"r], + (i for i in (mer"AGC"r, mer"AGA"r, mer"UUU"r)), + (mer"AAC"r, mer"AGG"r), + (mer"UUG"r,), + ] -# Cannot transla AA seq -@test_throws MethodError translate(mer"LLVM"aa) -@test_throws MethodError translate(mer"ATG"aa) + @testset "Construction and basics" begin + @test isempty(CodonSet()) -end # translation + # Constuct the sets and basic properties + for codons in SAMPLE_SOURCES + set = Set(codons) + codonset = CodonSet(codons) + @test issetequal(set, codonset) + @test length(codonset) == length(set) + end -@testset "CodonSet" begin -CodonSet = Kmers.CodonSet - -SAMPLE_SOURCES = Any[ - [mer"UAG"r, mer"ACC"r, mer"ACC"r, mer"UGG"r], - RNACodon[], - [mer"AAA"r, mer"ACC"r, mer"AAA"r, mer"UCA"r, mer"UCC"r], - (i for i in (mer"AGC"r, mer"AGA"r, mer"UUU"r)), - (mer"AAC"r, mer"AGG"r), - (mer"UUG"r,), -] - -@testset "Construction and basics" begin - @test isempty(CodonSet()) - - # Constuct the sets and basic properties - for codons in SAMPLE_SOURCES - set = Set(codons) - codonset = CodonSet(codons) - @test issetequal(set, codonset) - @test length(codonset) == length(set) + # Fails with non-codons + @test_throws MethodError CodonSet([(RNA_A, RNA_G)]) + @test_throws MethodError CodonSet((mer"UA"r,)) + @test_throws MethodError CodonSet([rna"AGG", rna"GGG"]) + @test_throws MethodError CodonSet([1, 2, 3]) end - # Fails with non-codons - @test_throws MethodError CodonSet([(RNA_A, RNA_G)]) - @test_throws MethodError CodonSet((mer"UA"r,)) - @test_throws MethodError CodonSet([rna"AGG", rna"GGG"]) - @test_throws MethodError CodonSet([1,2,3]) -end + SAMPLE_CODONSETS = map(CodonSet, SAMPLE_SOURCES) -SAMPLE_CODONSETS = map(CodonSet, SAMPLE_SOURCES) + @testset "Iteration" begin + for things in SAMPLE_SOURCES + @test sort!(collect(CodonSet(things))) == sort!(collect(Set(things))) + end -@testset "Iteration" begin - for things in SAMPLE_SOURCES - @test sort!(collect(CodonSet(things))) == sort!(collect(Set(things))) + @test iterate(CodonSet()) === nothing + codonset = CodonSet((mer"UUU"r,)) + codon, state = iterate(codonset) + @test codon == mer"UUU"r + @test iterate(codonset, state) === nothing end - @test iterate(CodonSet()) === nothing - codonset = CodonSet((mer"UUU"r,)) - codon, state = iterate(codonset) - @test codon == mer"UUU"r - @test iterate(codonset, state) === nothing -end - -@testset "Membership" begin - codonset = CodonSet([mer"ACC"r, mer"UAG"r, mer"UUU"r]) - @test mer"ACC"r in codonset - @test mer"UAG"r in codonset - @test mer"UUU"r in codonset - @test !in(mer"GAA"r, codonset) - @test !in(mer"AAA"r, codonset) -end - -@testset "Modifying" begin - # Push - s1 = CodonSet([mer"GGA"r, mer"UGU"r]) - s2 = push(s1, mer"GGA"r) - @test s1 == s2 - s3 = push(s2, mer"GAG"r) - @test Set(s3) == Set([mer"GGA"r, mer"UGU"r, mer"GGA"r, mer"GAG"r]) - - # Delete - s4 = delete(s3, mer"GAG"r) - @test s2 == s4 - s5 = delete(s4, mer"UGU"r) - @test only(s5) == mer"GGA"r - s6 = delete(s5, mer"UUU"r) - @test s5 == s6 - s7 = delete(s6, mer"GGA"r) - @test isempty(s7) -end - -@testset "Set operations" begin - for c1 in SAMPLE_CODONSETS, c2 in SAMPLE_CODONSETS - s1, s2 = Set(c1), Set(c2) - for operation in [union, intersect, setdiff, symdiff] - @test Set(operation(c1, c2)) == operation(s1, s2) - end - @test issubset(c1, c2) == issubset(s1, s2) + @testset "Membership" begin + codonset = CodonSet([mer"ACC"r, mer"UAG"r, mer"UUU"r]) + @test mer"ACC"r in codonset + @test mer"UAG"r in codonset + @test mer"UUU"r in codonset + @test !in(mer"GAA"r, codonset) + @test !in(mer"AAA"r, codonset) end -end - -@testset "Filter" begin - predicates = [ - (i -> i[2] == RNA_G), - (i -> isodd(length(i))), # always true for codons - (i -> i[1] == i[3]), - (i -> i[2] != RNA_A) - ] - for codonset in SAMPLE_CODONSETS, predicate in predicates - @test Set(filter(predicate, codonset)) == filter(predicate, Set(codonset)) + + @testset "Modifying" begin + # Push + s1 = CodonSet([mer"GGA"r, mer"UGU"r]) + s2 = push(s1, mer"GGA"r) + @test s1 == s2 + s3 = push(s2, mer"GAG"r) + @test Set(s3) == Set([mer"GGA"r, mer"UGU"r, mer"GGA"r, mer"GAG"r]) + + # Delete + s4 = delete(s3, mer"GAG"r) + @test s2 == s4 + s5 = delete(s4, mer"UGU"r) + @test only(s5) == mer"GGA"r + s6 = delete(s5, mer"UUU"r) + @test s5 == s6 + s7 = delete(s6, mer"GGA"r) + @test isempty(s7) end -end + @testset "Set operations" begin + for c1 in SAMPLE_CODONSETS, c2 in SAMPLE_CODONSETS + s1, s2 = Set(c1), Set(c2) + for operation in [union, intersect, setdiff, symdiff] + @test Set(operation(c1, c2)) == operation(s1, s2) + end + @test issubset(c1, c2) == issubset(s1, s2) + end + end + + @testset "Filter" begin + predicates = [ + (i -> i[2] == RNA_G), + (i -> isodd(length(i))), # always true for codons + (i -> i[1] == i[3]), + (i -> i[2] != RNA_A), + ] + for codonset in SAMPLE_CODONSETS, predicate in predicates + @test Set(filter(predicate, codonset)) == filter(predicate, Set(codonset)) + end + end end # CodonSet @testset "Reverse translation" begin @@ -143,7 +140,7 @@ end # CodonSet code2 = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code) for (rvcode, fwcode) in [ (Kmers.rev_standard_genetic_code, BioSequences.standard_genetic_code), - (code2, BioSequences.trematode_mitochondrial_genetic_code) + (code2, BioSequences.trematode_mitochondrial_genetic_code), ] @test reverse_translate(aa"", rvcode) == CodonSet[] observed = Dict{AminoAcid, CodonSet}() @@ -155,7 +152,7 @@ end # CodonSet # Length and iteration of ReverseGeneticCode @test length(rvcode) == length(symbols(AminoAcidAlphabet())) - 1 # all but AA_Gap - @test sort!(collect(rvcode), by=first) == sort!(collect(observed), by=first) + @test sort!(collect(rvcode); by=first) == sort!(collect(observed); by=first) flipped = Dict(v => k for (k, v) in observed) for (codonset, aa) in flipped @@ -181,13 +178,34 @@ end # CodonSet (AA_J, [AA_I, AA_L]), (AA_Z, [AA_E, AA_Q]), (AA_B, [AA_D, AA_N]), - (AA_X, [ - AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, - AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V - ]) - ] + ( + AA_X, + [ + AA_A, + AA_R, + AA_N, + AA_D, + AA_C, + AA_Q, + AA_E, + AA_G, + AA_H, + AA_I, + AA_L, + AA_K, + AA_M, + AA_F, + AA_P, + AA_S, + AA_T, + AA_W, + AA_Y, + AA_V, + ], + ), + ] c1 = only(reverse_translate(LongAA([ambig]), rvcode)) - c2 = foldl(elements, init=CodonSet()) do old, aa + c2 = foldl(elements; init=CodonSet()) do old, aa union(old, reverse_translate(aa, rvcode)) end @test c1 == c2 diff --git a/test/utils.jl b/test/utils.jl index a2d74b3..3761510 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -1,5 +1,9 @@ +function random_seq(A::Alphabet, n::Integer) + randseq(A, SamplerUniform(symbols(A)), n) +end + # Return a random DNA/RNA sequence of the given length. -function random_seq(n::Integer, nts, probs, outtype = String) +function random_seq(n::Integer, nts, probs, outtype=String) cumprobs = cumsum(probs) x = Vector{Char}(undef, n) for i in 1:n @@ -8,7 +12,7 @@ function random_seq(n::Integer, nts, probs, outtype = String) return outtype(x) end -function random_seq(::Type{A}, n::Integer) where {A<:Alphabet} +function random_seq(::Type{A}, n::Integer) where {A <: Alphabet} # TODO: Resolve the use of symbols(A()). nts = symbols(A()) probs = Vector{Float64}(undef, length(nts)) @@ -25,10 +29,33 @@ function random_rna(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) end function random_aa(len) - return random_seq(len, - ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X' ], - push!(fill(0.049, 20), 0.02)) + return random_seq( + len, + [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'Q', + 'E', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V', + 'X', + ], + push!(fill(0.049, 20), 0.02), + ) end function random_dna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) @@ -39,13 +66,35 @@ function random_rna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) return random_seq(n, ['A', 'C', 'G', 'U', 'N'], probs, Vector{RNA}) end -function random_rna_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) - return random_seq(n, ['A', 'C', 'G', 'U', 'N'], probs, Vector{RNA}) -end - function random_aa_symbols(n, probs=[0.24, 0.24, 0.24, 0.24, 0.04]) - return random_seq(n, ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', - 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'X' ], probs, Vector{AminoAcid}) + return random_seq( + n, + [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'Q', + 'E', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V', + 'X', + ], + probs, + Vector{AminoAcid}, + ) end function random_dna_kmer(len) @@ -72,4 +121,4 @@ function rna_complement(seq::AbstractString) seqc[i] = complementer[c] end return String(seqc) -end \ No newline at end of file +end