From d59c5785f12508834dcf21e1e32a6e94a4c384e5 Mon Sep 17 00:00:00 2001 From: Antonio Camargo Date: Wed, 20 Nov 2019 20:40:22 -0300 Subject: [PATCH] Parallelize k-mer frequency computation --- Cargo.toml | 8 ++++- Dockerfile | 2 +- rnasamba/cli.py | 6 ++-- rnasamba/core/inputs.py | 4 +-- rnasamba/core/model.py | 13 +++------ rnasamba/core/sequences.py | 26 ----------------- setup.py | 2 +- src/lib.rs | 60 ++++++++++++++++++++++++++++++++++---- 8 files changed, 72 insertions(+), 49 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 535481a..f4b23b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rnasamba" -version = "0.2.0" +version = "0.2.1" authors = ["Antonio Camargo "] edition = "2018" @@ -8,6 +8,12 @@ edition = "2018" name = "rnasamba" crate-type = ["cdylib"] +[dependencies] +itertools = "0.8.1" +ndarray = "0.13.0" +numpy = "0.7.0" +rayon = "1.2.0" + [dependencies.pyo3] version = "0.8.1" features = ["extension-module"] \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c0d8bdf..98d2544 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ RUN pip install --no-cache-dir \ 'biopython==1.74' \ 'keras==2.2.5' \ 'numpy==1.16.5' \ - 'rnasamba==0.2.0' \ + 'rnasamba==0.2.1' \ 'tensorflow==1.14.0' VOLUME ["/app"] diff --git a/rnasamba/cli.py b/rnasamba/cli.py index 2e6ae52..64c8ff0 100644 --- a/rnasamba/cli.py +++ b/rnasamba/cli.py @@ -48,7 +48,7 @@ def train(args): def classify_cli(parser): - parser.add_argument('--version', action='version', version='%(prog)s 0.2.0') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.1') parser.set_defaults(func=classify) parser.add_argument( 'output_file', @@ -79,7 +79,7 @@ def classify_cli(parser): def train_cli(parser): parser.set_defaults(func=train) - parser.add_argument('--version', action='version', version='%(prog)s 0.2.0') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.1') parser.add_argument( 'output_file', help='output HDF5 file containing weights of the newly trained RNAsamba network.', @@ -128,7 +128,7 @@ def cli(): description='Coding potential calculation using deep learning.', formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument('--version', action='version', version='%(prog)s 0.2.0') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.1') subparsers = parser.add_subparsers() classify_parser = subparsers.add_parser( 'classify', diff --git a/rnasamba/core/inputs.py b/rnasamba/core/inputs.py index d78660b..3950a10 100644 --- a/rnasamba/core/inputs.py +++ b/rnasamba/core/inputs.py @@ -20,7 +20,7 @@ from keras.preprocessing.sequence import pad_sequences -from rnasamba.core import sequences +from rnasamba.core import sequences, kmer class RNAsambaInput: @@ -73,7 +73,7 @@ def get_nucleotide_input(self): return nucleotide_input def get_kmer_frequency_input(self): - kmer_frequency_input = sequences.kmer_frequency(self._nucleotide_sequences) + kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_sequences) return kmer_frequency_input def get_orf_indicator_input(self): diff --git a/rnasamba/core/model.py b/rnasamba/core/model.py index ee73217..72fc0ad 100644 --- a/rnasamba/core/model.py +++ b/rnasamba/core/model.py @@ -223,11 +223,9 @@ def get_rnasamba_model(maxlen, protein_maxlen): aa_frequency_layer = Input(name='aa_frequency_layer', shape=(21,)) # Nucleotide branch (first branch): - emb_mat_nuc = np.random.normal(0, 1, (5, 4)) + emb_mat_nuc = np.zeros((5, 4)) for i in range(1, 5): - vector = np.zeros(4) - vector[i - 1] = 1 - emb_mat_nuc[i] = vector + emb_mat_nuc[i, i - 1] = 1 embedding_layer_nucleotide = Embedding( input_dim=emb_mat_nuc.shape[0], output_dim=4, @@ -284,12 +282,9 @@ def get_rnasamba_model(maxlen, protein_maxlen): orf_length_branch = Lambda(lambda d: tf.tile(d, [1, 64]))(orf_length_branch) # Protein branch: - emb_mat_prot = np.random.random((22, 21)) - emb_mat_prot[0] = np.zeros(21) + emb_mat_prot = np.zeros((22, 21)) for i in range(1, 22): - vector = np.zeros(21) - vector[i - 1] = 1 - emb_mat_prot[i] = vector + emb_mat_prot[i, i - 1] = 1 embedding_layer_protein = Embedding( input_dim=emb_mat_prot.shape[0], output_dim=21, diff --git a/rnasamba/core/sequences.py b/rnasamba/core/sequences.py index 4765197..0d4e5c3 100644 --- a/rnasamba/core/sequences.py +++ b/rnasamba/core/sequences.py @@ -27,8 +27,6 @@ from keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical -from rnasamba.core.kmer import count_kmers - def read_fasta(filename, tokenize=False): seqs = [] @@ -81,30 +79,6 @@ def orf_indicator(orfs, maxlen): return orf_indicator -def kmer_frequency(sequence_tuple, kmer_lengths=[2, 3, 4]): - kmer_frequency = [] - bases = ['A', 'T', 'C', 'G'] - for nucleotide_seq in sequence_tuple: - matches = [bases, bases] - sequence_kmer_frequency = [] - for current_length in kmer_lengths: - current_seq = nucleotide_seq[0] - total_kmers = len(current_seq) - (current_length - 1) - kmer_count = count_kmers(current_seq, current_length) - for match in itertools.product(*matches): - current_kmer = ''.join(match) - if current_kmer in kmer_count: - sequence_kmer_frequency.append( - kmer_count[current_kmer] / (total_kmers) - ) - else: - sequence_kmer_frequency.append(0) - matches.append(bases) - kmer_frequency.append(sequence_kmer_frequency) - kmer_frequency = np.stack(kmer_frequency) - return kmer_frequency - - def aa_frequency(aa_dict, orfs): aa_numeric = list(aa_dict.values()) aa_numeric.sort() diff --git a/setup.py b/setup.py index 1e807a1..63d134e 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( name='rnasamba', - version='0.2.0', + version='0.2.1', packages=find_packages(), rust_extensions=[RustExtension('rnasamba.core.kmer', debug=False)], zip_safe=False, diff --git a/src/lib.rs b/src/lib.rs index e19991e..73a5e75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,20 +1,68 @@ -use pyo3::prelude::*; -use pyo3::wrap_pyfunction; +use itertools::iproduct; +use ndarray::Array2; +use numpy::{convert::ToPyArray, PyArray2}; +use pyo3::{prelude::*, wrap_pyfunction}; +use rayon::prelude::*; use std::collections::HashMap; -#[pyfunction] -fn count_kmers(sequence: &str, k: usize) -> PyResult> { +fn sequence_kmer_counts(sequence: &str, k: usize) -> HashMap<&str, u16> { let mut counts = HashMap::new(); let n_kmers = sequence.len() - k + 1; for i in 0..n_kmers { let kmer = &sequence[i..i + k]; *counts.entry(kmer).or_insert(0) += 1; } - Ok(counts) + counts +} + +fn kmer_generator(alphabet: String, k: usize) -> Vec { + match k { + 0 => vec![], + 1 => alphabet.chars().map(|c| c.to_string()).collect(), + 2 => iproduct!(alphabet.chars(), alphabet.chars()) + .map(|(a, b)| format!("{}{}", a, b)) + .collect(), + _ => iproduct!(kmer_generator(alphabet.clone(), k - 1), alphabet.chars()) + .map(|(a, b)| format!("{}{}", a, b)) + .collect(), + } +} + +fn sequence_kmer_frequencies(sequence: &str) -> Vec { + let mut kmer_frequency_array: Vec = Vec::new(); + for k in 2..5 { + let sequence_total_kmers = sequence.len() - k + 1; + let sequence_kmer_count = sequence_kmer_counts(sequence, k); + for kmer in kmer_generator(String::from("ATCG"), k).into_iter() { + let kmer_count: u16; + match sequence_kmer_count.get(&kmer[..]) { + Some(n) => kmer_count = *n, + _ => kmer_count = 0, + }; + let kmer_frequency = kmer_count as f32 / sequence_total_kmers as f32; + kmer_frequency_array.push(kmer_frequency); + } + } + kmer_frequency_array +} + +#[pyfunction] +fn kmer_frequencies_array(sequences: Vec<(&str, &str)>) -> Py> { + Array2::from_shape_vec( + (sequences.len(), 336), + sequences + .par_iter() + .map(|sequence| sequence_kmer_frequencies(sequence.0)) + .flatten() + .collect(), + ) + .unwrap() + .to_pyarray(Python::acquire_gil().python()) + .to_owned() } #[pymodule] fn kmer(_py: Python<'_>, m: &PyModule) -> PyResult<()> { - m.add_wrapped(wrap_pyfunction!(count_kmers))?; + m.add_wrapped(wrap_pyfunction!(kmer_frequencies_array))?; Ok(()) }