Skip to content

Commit

Permalink
Parallelize k-mer frequency computation
Browse files Browse the repository at this point in the history
  • Loading branch information
apcamargo committed Nov 20, 2019
1 parent 9d9a695 commit d59c578
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 49 deletions.
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
[package]
name = "rnasamba"
version = "0.2.0"
version = "0.2.1"
authors = ["Antonio Camargo <[email protected]>"]
edition = "2018"

[lib]
name = "rnasamba"
crate-type = ["cdylib"]

[dependencies]
itertools = "0.8.1"
ndarray = "0.13.0"
numpy = "0.7.0"
rayon = "1.2.0"

[dependencies.pyo3]
version = "0.8.1"
features = ["extension-module"]
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RUN pip install --no-cache-dir \
'biopython==1.74' \
'keras==2.2.5' \
'numpy==1.16.5' \
'rnasamba==0.2.0' \
'rnasamba==0.2.1' \
'tensorflow==1.14.0'

VOLUME ["/app"]
Expand Down
6 changes: 3 additions & 3 deletions rnasamba/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def train(args):


def classify_cli(parser):
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.2.1')
parser.set_defaults(func=classify)
parser.add_argument(
'output_file',
Expand Down Expand Up @@ -79,7 +79,7 @@ def classify_cli(parser):

def train_cli(parser):
parser.set_defaults(func=train)
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.2.1')
parser.add_argument(
'output_file',
help='output HDF5 file containing weights of the newly trained RNAsamba network.',
Expand Down Expand Up @@ -128,7 +128,7 @@ def cli():
description='Coding potential calculation using deep learning.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--version', action='version', version='%(prog)s 0.2.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.2.1')
subparsers = parser.add_subparsers()
classify_parser = subparsers.add_parser(
'classify',
Expand Down
4 changes: 2 additions & 2 deletions rnasamba/core/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from keras.preprocessing.sequence import pad_sequences

from rnasamba.core import sequences
from rnasamba.core import sequences, kmer


class RNAsambaInput:
Expand Down Expand Up @@ -73,7 +73,7 @@ def get_nucleotide_input(self):
return nucleotide_input

def get_kmer_frequency_input(self):
kmer_frequency_input = sequences.kmer_frequency(self._nucleotide_sequences)
kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_sequences)
return kmer_frequency_input

def get_orf_indicator_input(self):
Expand Down
13 changes: 4 additions & 9 deletions rnasamba/core/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,9 @@ def get_rnasamba_model(maxlen, protein_maxlen):
aa_frequency_layer = Input(name='aa_frequency_layer', shape=(21,))

# Nucleotide branch (first branch):
emb_mat_nuc = np.random.normal(0, 1, (5, 4))
emb_mat_nuc = np.zeros((5, 4))
for i in range(1, 5):
vector = np.zeros(4)
vector[i - 1] = 1
emb_mat_nuc[i] = vector
emb_mat_nuc[i, i - 1] = 1
embedding_layer_nucleotide = Embedding(
input_dim=emb_mat_nuc.shape[0],
output_dim=4,
Expand Down Expand Up @@ -284,12 +282,9 @@ def get_rnasamba_model(maxlen, protein_maxlen):
orf_length_branch = Lambda(lambda d: tf.tile(d, [1, 64]))(orf_length_branch)

# Protein branch:
emb_mat_prot = np.random.random((22, 21))
emb_mat_prot[0] = np.zeros(21)
emb_mat_prot = np.zeros((22, 21))
for i in range(1, 22):
vector = np.zeros(21)
vector[i - 1] = 1
emb_mat_prot[i] = vector
emb_mat_prot[i, i - 1] = 1
embedding_layer_protein = Embedding(
input_dim=emb_mat_prot.shape[0],
output_dim=21,
Expand Down
26 changes: 0 additions & 26 deletions rnasamba/core/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from rnasamba.core.kmer import count_kmers


def read_fasta(filename, tokenize=False):
seqs = []
Expand Down Expand Up @@ -81,30 +79,6 @@ def orf_indicator(orfs, maxlen):
return orf_indicator


def kmer_frequency(sequence_tuple, kmer_lengths=[2, 3, 4]):
kmer_frequency = []
bases = ['A', 'T', 'C', 'G']
for nucleotide_seq in sequence_tuple:
matches = [bases, bases]
sequence_kmer_frequency = []
for current_length in kmer_lengths:
current_seq = nucleotide_seq[0]
total_kmers = len(current_seq) - (current_length - 1)
kmer_count = count_kmers(current_seq, current_length)
for match in itertools.product(*matches):
current_kmer = ''.join(match)
if current_kmer in kmer_count:
sequence_kmer_frequency.append(
kmer_count[current_kmer] / (total_kmers)
)
else:
sequence_kmer_frequency.append(0)
matches.append(bases)
kmer_frequency.append(sequence_kmer_frequency)
kmer_frequency = np.stack(kmer_frequency)
return kmer_frequency


def aa_frequency(aa_dict, orfs):
aa_numeric = list(aa_dict.values())
aa_numeric.sort()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

setup(
name='rnasamba',
version='0.2.0',
version='0.2.1',
packages=find_packages(),
rust_extensions=[RustExtension('rnasamba.core.kmer', debug=False)],
zip_safe=False,
Expand Down
60 changes: 54 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,68 @@
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
use itertools::iproduct;
use ndarray::Array2;
use numpy::{convert::ToPyArray, PyArray2};
use pyo3::{prelude::*, wrap_pyfunction};
use rayon::prelude::*;
use std::collections::HashMap;

#[pyfunction]
fn count_kmers(sequence: &str, k: usize) -> PyResult<HashMap<&str, u16>> {
fn sequence_kmer_counts(sequence: &str, k: usize) -> HashMap<&str, u16> {
let mut counts = HashMap::new();
let n_kmers = sequence.len() - k + 1;
for i in 0..n_kmers {
let kmer = &sequence[i..i + k];
*counts.entry(kmer).or_insert(0) += 1;
}
Ok(counts)
counts
}

fn kmer_generator(alphabet: String, k: usize) -> Vec<String> {
match k {
0 => vec![],
1 => alphabet.chars().map(|c| c.to_string()).collect(),
2 => iproduct!(alphabet.chars(), alphabet.chars())
.map(|(a, b)| format!("{}{}", a, b))
.collect(),
_ => iproduct!(kmer_generator(alphabet.clone(), k - 1), alphabet.chars())
.map(|(a, b)| format!("{}{}", a, b))
.collect(),
}
}

fn sequence_kmer_frequencies(sequence: &str) -> Vec<f32> {
let mut kmer_frequency_array: Vec<f32> = Vec::new();
for k in 2..5 {
let sequence_total_kmers = sequence.len() - k + 1;
let sequence_kmer_count = sequence_kmer_counts(sequence, k);
for kmer in kmer_generator(String::from("ATCG"), k).into_iter() {
let kmer_count: u16;
match sequence_kmer_count.get(&kmer[..]) {
Some(n) => kmer_count = *n,
_ => kmer_count = 0,
};
let kmer_frequency = kmer_count as f32 / sequence_total_kmers as f32;
kmer_frequency_array.push(kmer_frequency);
}
}
kmer_frequency_array
}

#[pyfunction]
fn kmer_frequencies_array(sequences: Vec<(&str, &str)>) -> Py<PyArray2<f32>> {
Array2::from_shape_vec(
(sequences.len(), 336),
sequences
.par_iter()
.map(|sequence| sequence_kmer_frequencies(sequence.0))
.flatten()
.collect(),
)
.unwrap()
.to_pyarray(Python::acquire_gil().python())
.to_owned()
}

#[pymodule]
fn kmer(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(count_kmers))?;
m.add_wrapped(wrap_pyfunction!(kmer_frequencies_array))?;
Ok(())
}

0 comments on commit d59c578

Please sign in to comment.