Commit

Version 0.2.3

apcamargo committed Nov 22, 2019
1 parent 65be448 commit cf1c0e2
Showing 8 changed files with 34 additions and 41 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "rnasamba"
version = "0.2.2"
version = "0.2.3"
authors = ["Antonio Camargo <[email protected]>"]
edition = "2018"

@@ -12,9 +12,9 @@ crate-type = ["cdylib"]
itertools = "0.8.1"
ndarray = "0.13.0"
numpy = "0.7.0"
rayon = "1.2.0"
rayon = "1.2.1"
regex = "1.3.1"

[dependencies.pyo3]
version = "0.8.1"
version = "0.8.2"
features = ["extension-module"]
2 changes: 1 addition & 1 deletion Dockerfile
@@ -5,7 +5,7 @@ RUN pip install --no-cache-dir \
'biopython==1.74' \
'keras==2.2.5' \
'numpy==1.16.5' \
-'rnasamba==0.2.2' \
+'rnasamba==0.2.3' \
'tensorflow==1.14.0'

VOLUME ["/app"]
6 changes: 3 additions & 3 deletions rnasamba/cli.py
@@ -48,7 +48,7 @@ def train(args):


def classify_cli(parser):
-    parser.add_argument('--version', action='version', version='%(prog)s 0.2.2')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.3')
parser.set_defaults(func=classify)
parser.add_argument(
'output_file',
@@ -79,7 +79,7 @@ def classify_cli(parser):

def train_cli(parser):
parser.set_defaults(func=train)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.2.2')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.3')
parser.add_argument(
'output_file',
help='output HDF5 file containing weights of the newly trained RNAsamba network.',
@@ -128,7 +128,7 @@ def cli():
description='Coding potential calculation using deep learning.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.2.2')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.2.3')
subparsers = parser.add_subparsers()
classify_parser = subparsers.add_parser(
'classify',
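A note on the pattern being bumped in this file: argparse's built-in 'version' action prints the interpolated string and exits. A minimal, self-contained sketch (the prog name is set explicitly for illustration; it is not part of this commit):

import argparse

# Sketch of the argparse 'version' action used throughout rnasamba/cli.py.
parser = argparse.ArgumentParser(prog='rnasamba')
parser.add_argument('--version', action='version', version='%(prog)s 0.2.3')
parser.parse_args(['--version'])  # prints "rnasamba 0.2.3" and exits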
13 changes: 6 additions & 7 deletions rnasamba/core/inputs.py
@@ -24,8 +24,9 @@

class RNAsambaInput:
def __init__(self, fasta_file, maxlen=3000):
-        self._tokenized_sequences = sequences.read_fasta(fasta_file, tokenize=True)
-        self._nucleotide_sequences = sequences.read_fasta(fasta_file, tokenize=False)
+        self._nucleotide_seqs, self._token_seqs, self.seqs_names = sequences.read_fasta(
+            fasta_file
+        )
self._aa_dict = {
'A': 4,
'C': 18,
@@ -58,21 +58,19 @@ def __init__(self, fasta_file, maxlen=3000):
self.orf_indicator_input = self.get_orf_indicator_input()
self.protein_input = self.get_protein_input()
self.aa_frequency_input = self.get_aa_frequency_input()
-        self.sequence_name = [seq[1] for seq in self._nucleotide_sequences]

def get_orfs(self):
-        orfs = orf.longest_orf_array(self._nucleotide_sequences)
+        orfs = orf.longest_orf_array(self._nucleotide_seqs)
return orfs

def get_nucleotide_input(self):
-        nucleotide_input = [i[0] for i in self._tokenized_sequences]
        nucleotide_input = pad_sequences(
-            nucleotide_input, padding='post', maxlen=self.maxlen
+            self._token_seqs, padding='post', maxlen=self.maxlen
        )
return nucleotide_input

def get_kmer_frequency_input(self):
-        kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_sequences)
+        kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_seqs)
return kmer_frequency_input

def get_orf_indicator_input(self):
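For context on the get_nucleotide_input change above: the tokenized sequences list can now be handed to pad_sequences directly. A small illustrative sketch, assuming Keras' pad_sequences import (toy token lists, not repository data):

from keras.preprocessing.sequence import pad_sequences

# Right-pad each tokenized sequence with zeros up to maxlen, as get_nucleotide_input does.
token_seqs = [[1, 2, 3], [4, 1]]
padded = pad_sequences(token_seqs, padding='post', maxlen=5)
# padded -> array([[1, 2, 3, 0, 0],
#                  [4, 1, 0, 0, 0]], dtype=int32)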
11 changes: 4 additions & 7 deletions rnasamba/core/model.py
@@ -48,7 +48,7 @@ def __init__(self, fasta_file, weights, verbose=0):
self.input = RNAsambaInput(fasta_file)
self.maxlen = self.input.maxlen
self.protein_maxlen = self.input.protein_maxlen
-        self.sequence_name = self.input.sequence_name
+        self.seqs_names = self.input.seqs_names
self.protein_seqs = self.input.protein_seqs
self.input_dict = {
'nucleotide_layer': self.input.nucleotide_input,
@@ -97,7 +97,7 @@ def write_classification_output(self, output_file):
with open(output_file, 'w') as handle:
handle.write('sequence_name\tcoding_score\tclassification\n')
for i in range(len(self.classification_label)):
-                handle.write(self.sequence_name[i])
+                handle.write(self.seqs_names[i])
handle.write('\t')
handle.write('{:.5f}'.format(self.coding_score[i]))
handle.write('\t')
@@ -110,7 +110,7 @@ def output_protein_fasta(self, protein_fasta):
if self.classification_label[i] == 'coding':
if self.protein_seqs[i]:
handle.write('>')
-                        handle.write(self.sequence_name[i])
+                        handle.write(self.seqs_names[i])
handle.write('\n')
handle.write(self.protein_seqs[i])
handle.write('\n')
@@ -140,10 +140,7 @@ def __init__(
self.protein_maxlen = self.coding_input.protein_maxlen
self.labels = np.repeat(
[[0, 1], [1, 0]],
-            [
-                len(self.coding_input.sequence_name),
-                len(self.noncoding_input.sequence_name),
-            ],
+            [len(self.coding_input.seqs_names), len(self.noncoding_input.seqs_names)],
axis=0,
)
self.input_dict = {
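For clarity on the compacted np.repeat call above: it emits one one-hot label row per training sequence, coding first, then noncoding. A worked example with toy counts standing in for the two len(...seqs_names) values:

import numpy as np

# 3 coding sequences followed by 2 noncoding sequences (toy counts).
labels = np.repeat([[0, 1], [1, 0]], [3, 2], axis=0)
# labels -> array([[0, 1],
#                  [0, 1],
#                  [0, 1],
#                  [1, 0],
#                  [1, 0]])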
27 changes: 12 additions & 15 deletions rnasamba/core/sequences.py
@@ -26,24 +26,22 @@
from keras.utils import to_categorical


-def read_fasta(filename, tokenize=False):
+def read_fasta(filename):
seqs = []
+    seqs_tokenized = []
+    seqs_names = []
with open(filename) as handle:
-        if tokenize:
-            for record in SeqIO.parse(handle, 'fasta'):
-                sequence_str = str(record.seq).upper().replace('U', 'T')
-                sequence_name = record.description
-                seqs.append((tokenize_dna(sequence_str), sequence_name))
-        else:
-            for record in SeqIO.parse(handle, 'fasta'):
-                sequence_str = str(record.seq).upper().replace('U', 'T')
-                sequence_name = record.description
-                seqs.append((sequence_str, sequence_name))
-    return seqs
+        for record in SeqIO.parse(handle, 'fasta'):
+            sequence_str = str(record.seq).upper().replace('U', 'T')
+            sequence_name = record.description
+            seqs.append(sequence_str)
+            seqs_tokenized.append(tokenize_dna(sequence_str))
+            seqs_names.append(sequence_name)
+    return seqs, seqs_tokenized, seqs_names


def tokenize_dna(sequence):
-    lookup = dict(zip('NATCG', range(5)))
+    lookup = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4}
if not sequence:
token = [0]
else:
@@ -65,8 +63,7 @@ def orf_indicator(orfs, maxlen):


def aa_frequency(aa_dict, orfs):
-    aa_numeric = list(aa_dict.values())
-    aa_numeric.sort()
+    aa_numeric = list(range(1, 22))
aa_frequency = []
for orf in orfs:
protein_seq = orf[2]
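Taken together, read_fasta now makes a single pass over the FASTA file and returns three parallel lists (raw sequences, tokenized sequences, and sequence names) instead of lists of (sequence, name) tuples, and the literal lookup in tokenize_dna maps the same characters to the same integers as the previous dict(zip('NATCG', range(5))). A hedged usage sketch of the new return shape (the import path and file name are illustrative):

from rnasamba.core import sequences

# read_fasta returns three lists whose i-th entries describe the same record.
nucleotide_seqs, token_seqs, seqs_names = sequences.read_fasta('transcripts.fa')
for name, seq, tokens in zip(seqs_names, nucleotide_seqs, token_seqs):
    print(name, len(seq), tokens[:10])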
2 changes: 1 addition & 1 deletion setup.py
@@ -24,7 +24,7 @@

setup(
name='rnasamba',
-    version='0.2.2',
+    version='0.2.3',
packages=find_packages(),
rust_extensions=[
RustExtension('rnasamba.core.kmer', debug=False),
8 changes: 4 additions & 4 deletions src/lib.rs
@@ -81,10 +81,10 @@ fn sequence_longest_orf(sequence: &str) -> (usize, usize, String) {
}

#[pyfunction]
-fn longest_orf_array(sequences: Vec<(&str, &str)>) -> PyResult<Vec<(usize, usize, String)>> {
+fn longest_orf_array(sequences: Vec<&str>) -> PyResult<Vec<(usize, usize, String)>> {
Ok(sequences
.par_iter()
-        .map(|sequence| sequence_longest_orf(sequence.0))
+        .map(|sequence| sequence_longest_orf(sequence))
.collect())
}

@@ -136,12 +136,12 @@ fn sequence_kmer_frequencies(sequence: &str) -> Vec<f32> {
}

#[pyfunction]
-fn kmer_frequencies_array(sequences: Vec<(&str, &str)>) -> Py<PyArray2<f32>> {
+fn kmer_frequencies_array(sequences: Vec<&str>) -> Py<PyArray2<f32>> {
Array2::from_shape_vec(
(sequences.len(), 336),
sequences
.par_iter()
-            .map(|sequence| sequence_kmer_frequencies(sequence.0))
+            .map(|sequence| sequence_kmer_frequencies(sequence))
.flatten()
.collect(),
)
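On the Python side these signature changes mean both extension functions now take a flat list of sequence strings, matching the inputs.py call sites above, rather than (sequence, name) tuples. A hedged sketch of the call sites, assuming the extensions are importable as rnasamba.core.orf and rnasamba.core.kmer (toy sequences, not repository data):

from rnasamba.core import kmer, orf

seqs = ['ATGGCCTAA', 'ATGAAATGA']  # toy transcripts
orfs = orf.longest_orf_array(seqs)         # one (int, int, str) tuple per sequence, per the Rust return type
freqs = kmer.kmer_frequencies_array(seqs)  # float32 array of shape (len(seqs), 336) k-mer frequencies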
