diff --git a/Cargo.toml b/Cargo.toml index 125d9a0..3bad3a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rnasamba" -version = "0.2.2" +version = "0.2.3" authors = ["Antonio Camargo "] edition = "2018" @@ -12,9 +12,9 @@ crate-type = ["cdylib"] itertools = "0.8.1" ndarray = "0.13.0" numpy = "0.7.0" -rayon = "1.2.0" +rayon = "1.2.1" regex = "1.3.1" [dependencies.pyo3] -version = "0.8.1" +version = "0.8.2" features = ["extension-module"] \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 8a2fc74..295f6a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ RUN pip install --no-cache-dir \ 'biopython==1.74' \ 'keras==2.2.5' \ 'numpy==1.16.5' \ - 'rnasamba==0.2.2' \ + 'rnasamba==0.2.3' \ 'tensorflow==1.14.0' VOLUME ["/app"] diff --git a/rnasamba/cli.py b/rnasamba/cli.py index 0230689..a34f885 100644 --- a/rnasamba/cli.py +++ b/rnasamba/cli.py @@ -48,7 +48,7 @@ def train(args): def classify_cli(parser): - parser.add_argument('--version', action='version', version='%(prog)s 0.2.2') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.3') parser.set_defaults(func=classify) parser.add_argument( 'output_file', @@ -79,7 +79,7 @@ def classify_cli(parser): def train_cli(parser): parser.set_defaults(func=train) - parser.add_argument('--version', action='version', version='%(prog)s 0.2.2') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.3') parser.add_argument( 'output_file', help='output HDF5 file containing weights of the newly trained RNAsamba network.', @@ -128,7 +128,7 @@ def cli(): description='Coding potential calculation using deep learning.', formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument('--version', action='version', version='%(prog)s 0.2.2') + parser.add_argument('--version', action='version', version='%(prog)s 0.2.3') subparsers = parser.add_subparsers() classify_parser = subparsers.add_parser( 'classify', diff --git a/rnasamba/core/inputs.py b/rnasamba/core/inputs.py index 20897bc..247d56a 100644 --- a/rnasamba/core/inputs.py +++ b/rnasamba/core/inputs.py @@ -24,8 +24,9 @@ class RNAsambaInput: def __init__(self, fasta_file, maxlen=3000): - self._tokenized_sequences = sequences.read_fasta(fasta_file, tokenize=True) - self._nucleotide_sequences = sequences.read_fasta(fasta_file, tokenize=False) + self._nucleotide_seqs, self._token_seqs, self.seqs_names = sequences.read_fasta( + fasta_file + ) self._aa_dict = { 'A': 4, 'C': 18, @@ -58,21 +59,19 @@ def __init__(self, fasta_file, maxlen=3000): self.orf_indicator_input = self.get_orf_indicator_input() self.protein_input = self.get_protein_input() self.aa_frequency_input = self.get_aa_frequency_input() - self.sequence_name = [seq[1] for seq in self._nucleotide_sequences] def get_orfs(self): - orfs = orf.longest_orf_array(self._nucleotide_sequences) + orfs = orf.longest_orf_array(self._nucleotide_seqs) return orfs def get_nucleotide_input(self): - nucleotide_input = [i[0] for i in self._tokenized_sequences] nucleotide_input = pad_sequences( - nucleotide_input, padding='post', maxlen=self.maxlen + self._token_seqs, padding='post', maxlen=self.maxlen ) return nucleotide_input def get_kmer_frequency_input(self): - kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_sequences) + kmer_frequency_input = kmer.kmer_frequencies_array(self._nucleotide_seqs) return kmer_frequency_input def get_orf_indicator_input(self): diff --git a/rnasamba/core/model.py b/rnasamba/core/model.py index 72fc0ad..a777e92 100644 --- a/rnasamba/core/model.py +++ b/rnasamba/core/model.py @@ -48,7 +48,7 @@ def __init__(self, fasta_file, weights, verbose=0): self.input = RNAsambaInput(fasta_file) self.maxlen = self.input.maxlen self.protein_maxlen = self.input.protein_maxlen - self.sequence_name = self.input.sequence_name + self.seqs_names = self.input.seqs_names self.protein_seqs = self.input.protein_seqs self.input_dict = { 'nucleotide_layer': self.input.nucleotide_input, @@ -97,7 +97,7 @@ def write_classification_output(self, output_file): with open(output_file, 'w') as handle: handle.write('sequence_name\tcoding_score\tclassification\n') for i in range(len(self.classification_label)): - handle.write(self.sequence_name[i]) + handle.write(self.seqs_names[i]) handle.write('\t') handle.write('{:.5f}'.format(self.coding_score[i])) handle.write('\t') @@ -110,7 +110,7 @@ def output_protein_fasta(self, protein_fasta): if self.classification_label[i] == 'coding': if self.protein_seqs[i]: handle.write('>') - handle.write(self.sequence_name[i]) + handle.write(self.seqs_names[i]) handle.write('\n') handle.write(self.protein_seqs[i]) handle.write('\n') @@ -140,10 +140,7 @@ def __init__( self.protein_maxlen = self.coding_input.protein_maxlen self.labels = np.repeat( [[0, 1], [1, 0]], - [ - len(self.coding_input.sequence_name), - len(self.noncoding_input.sequence_name), - ], + [len(self.coding_input.seqs_names), len(self.noncoding_input.seqs_names)], axis=0, ) self.input_dict = { diff --git a/rnasamba/core/sequences.py b/rnasamba/core/sequences.py index ada5cf9..c6fd747 100644 --- a/rnasamba/core/sequences.py +++ b/rnasamba/core/sequences.py @@ -26,24 +26,22 @@ from keras.utils import to_categorical -def read_fasta(filename, tokenize=False): +def read_fasta(filename): seqs = [] + seqs_tokenized = [] + seqs_names = [] with open(filename) as handle: - if tokenize: - for record in SeqIO.parse(handle, 'fasta'): - sequence_str = str(record.seq).upper().replace('U', 'T') - sequence_name = record.description - seqs.append((tokenize_dna(sequence_str), sequence_name)) - else: - for record in SeqIO.parse(handle, 'fasta'): - sequence_str = str(record.seq).upper().replace('U', 'T') - sequence_name = record.description - seqs.append((sequence_str, sequence_name)) - return seqs + for record in SeqIO.parse(handle, 'fasta'): + sequence_str = str(record.seq).upper().replace('U', 'T') + sequence_name = record.description + seqs.append(sequence_str) + seqs_tokenized.append(tokenize_dna(sequence_str)) + seqs_names.append(sequence_name) + return seqs, seqs_tokenized, seqs_names def tokenize_dna(sequence): - lookup = dict(zip('NATCG', range(5))) + lookup = {'N': 0, 'A': 1, 'T': 2, 'C': 3, 'G': 4} if not sequence: token = [0] else: @@ -65,8 +63,7 @@ def orf_indicator(orfs, maxlen): def aa_frequency(aa_dict, orfs): - aa_numeric = list(aa_dict.values()) - aa_numeric.sort() + aa_numeric = list(range(1, 22)) aa_frequency = [] for orf in orfs: protein_seq = orf[2] diff --git a/setup.py b/setup.py index 8367fc6..f2358ec 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( name='rnasamba', - version='0.2.2', + version='0.2.3', packages=find_packages(), rust_extensions=[ RustExtension('rnasamba.core.kmer', debug=False), diff --git a/src/lib.rs b/src/lib.rs index d7da7db..1370afe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -81,10 +81,10 @@ fn sequence_longest_orf(sequence: &str) -> (usize, usize, String) { } #[pyfunction] -fn longest_orf_array(sequences: Vec<(&str, &str)>) -> PyResult> { +fn longest_orf_array(sequences: Vec<&str>) -> PyResult> { Ok(sequences .par_iter() - .map(|sequence| sequence_longest_orf(sequence.0)) + .map(|sequence| sequence_longest_orf(sequence)) .collect()) } @@ -136,12 +136,12 @@ fn sequence_kmer_frequencies(sequence: &str) -> Vec { } #[pyfunction] -fn kmer_frequencies_array(sequences: Vec<(&str, &str)>) -> Py> { +fn kmer_frequencies_array(sequences: Vec<&str>) -> Py> { Array2::from_shape_vec( (sequences.len(), 336), sequences .par_iter() - .map(|sequence| sequence_kmer_frequencies(sequence.0)) + .map(|sequence| sequence_kmer_frequencies(sequence)) .flatten() .collect(), )