-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre-processing.py
197 lines (167 loc) · 7.49 KB
/
pre-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
import numpy as np
import pandas as pd
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from collections import Counter
from pathlib import Path
import os
#########################################
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
#########################################
path1 = "/home/alekchiri/files/subHomo_sapiens.GRCh38.cdna_mod.fa"
path2 = "/home/alekchiri/files/sublncipedia_5_2.fasta"
path3 = "/home/alekchiri/files/humrep.fa"
path4 = "/home/alekchiri/files/sub_defam.fa"
def seq_finder(file):
"""
Parses a FASTA file and extracts sequences with their IDs.
This function reads a FASTA file and extracts the sequence IDs and the
corresponding nucleotide sequences as strings. The sequences are returned
as a list of tuples, where each tuple contains a sequence ID and the
sequence itself.
Parameters:
file (str): The path to the input FASTA file.
Returns:
List[Tuple[str, str]]: A list of tuples where each tuple contains a
sequence ID and its corresponding sequence.
"""
sequences = list()
for sequence in SeqIO.parse(file, "fasta"):
sequences.append((sequence.id, str(sequence.seq)))
return sequences
def one_hot(seq):
"""
Encode a DNA sequence into a one-hot representation.
This function takes a string representing a nucleotide sequence, where each nucleotide is
represented by 'A', 'C', 'G', 'T'. Nucleotides that are not recognized or ambiguous are
represented by 'N'. Each nucleotide is converted into a list of four elements, where each
element corresponds to 'A', 'C', 'G', 'T', respectively. The nucleotide present in the sequence
is represented by a 1 in the list, and the others by 0.
Parameters:
seq (str): The DNA sequence to be encoded. The sequence should be a string composed of the characters
'A', 'C', 'G', 'T', and optionally 'N' for any ambiguous or unknown nucleotides.
Returns:
list of list of int: The encoded list where each nucleotide is represented by a one-hot encoded list.
For example, 'A' is encoded as [1, 0, 0, 0], 'C' as [0, 1, 0, 0], etc.
Example:
>>> one_hot('ACGTN')
[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 0]]
"""
mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1], 'N': [0, 0, 0, 0]}
seq_encoded = list()
for nuc in seq.upper():
if nuc in mapping:
seq_encoded.append(mapping[nuc])
else:
seq_encoded.append(mapping['N'])
return seq_encoded
def fragmentor(sequence, id='', maxseq=75, overlap=275, max_gap=5):
"""
Splits a sequence into fragments with the specified overlap and size.
This function only fragments sequences longer than maxseq.
Parameters:
sequence (str): The input sequence to fragment.
id (str): Identifier to associate with each fragment.
maxseq (int): Maximum length of each fragment.
overlap (int): Overlap between consecutive fragments.
max_gap (int): Maximum percentage of gap '-' allowed in a fragment.
Returns:
list: A list of tuples where each tuple contains a fragment and its identifier.
"""
fragments = list()
step_size = maxseq - overlap
if len(sequence) <= maxseq:
return fragments
else:
for i in range(0, len(sequence), step_size):
fragment = sequence[i:i + maxseq]
if len(fragment) < maxseq:
fragment += 'N' * (maxseq - len(fragment))
num_ = fragment.count('N')
if ( num_ * 100 / maxseq) < max_gap:
fragments.append((fragment,id))
return fragments
def preparing_seq(file, maxseq):
"""
Prepares and processes sequences from a given file.
This function reads sequences from a file, fragments longer sequences
using the `fragmentor` function, and pads shorter sequences to ensure
uniform sequence length. It also labels the sequences based on the
prefix of the sequence ID.
Parameters:
file (str): The path to the file containing sequences in FASTA format.
maxseq (int): The maximum sequence length. Sequences longer than this
value will be fragmented, and shorter ones will be padded.
Returns:
tuple: A tuple containing:
- res (list): A list of tuples where each tuple contains a sequence
and its associated ID.
- labels (list): A list of labels assigned to each sequence based
on its ID prefix. The labels are:
- "protein_coding": for IDs starting with "ENS"
- "lncRNA": for IDs starting with "LIN"
- "rep": for all other IDs
Raises:
ValueError: If the lengths of `res` and `labels` lists are not the same.
"""
res = list()
labels = list()
sequences = seq_finder(file)
for id, sequence in sequences:
if len(sequence) > maxseq:
fragments = fragmentor(sequence,id=id,maxseq=maxseq)
res.extend(fragments)
elif len(sequence) < maxseq:
padded_sequence = sequence + "N" * (maxseq - len(sequence))
res.append((padded_sequence, id))
else:
res.append((sequence, id))
return res
path_test = "/home/aladdine_lekchiri/Téléchargements/DB/fasta_test.fa"
all_sequences = list()
all_labels = list()
sequence_ens,labels_ens = preparing_seq(path1,550)
labels_ens = ["protein_coding" for _ in sequence_ens]
sequence_lnc = preparing_seq(path2,550)
labels_lnc = ["lncRNA" for _ in sequence_lnc]
sequence_repbase = preparing_seq(path3,550)
labels_repbase = ["rep" for _ in sequence_repbase]
sequence_defam = preparing_seq(path4,550)
labels_defam = ["rep" for _ in sequence_defam]
sequence_rep = sequence_repbase + sequence_defam
labels_rep = labels_repbase + labels_defam
print(f"mra : {len(sequence_ens)}, lncrna : {len(sequence_lnc)}, repbase : {len(sequence_rep)}")
print("Pre_processing done !")
all_sequences = sequence_ens + sequence_lnc + sequence_rep
all_labels = labels_ens + labels_lnc + labels_rep
seq_one_hot_code = list()
for seq, id in all_sequences:
encoded = one_hot(seq)
seq_one_hot_code.append(encoded)
def convert_labels(labels):
"""
Converts a list of textual class labels into numerical labels.
This function maps each text label from a list to a corresponding integer based on a predefined dictionary.
It is particularly useful for preparing categorical data for machine learning models where numerical input
is required. The current mapping is defined as {'protein_coding': 0, 'lncRNA': 1, 'rep': 2}, which are common
categories for gene classification tasks.
Parameters:
labels (list of str): A list of text labels to convert. Each label must be one of the keys in the predefined
dictionary (i.e., 'protein_coding', 'lncRNA', 'rep').
Returns:
list of int: A list of integers representing the converted labels.
Example:
>>> convert_labels(['protein_coding', 'lncRNA', 'rep', 'lncRNA'])
[0, 1, 2, 1]
"""
dico = {'protein_coding': 0, 'lncRNA' : 1, 'rep' : 2}
num_labels = list()
for label in labels:
num_labels.append(dico[label])
return num_labels
labels = convert_labels(all_labels)
print(f'labels_number {len(all_labels)}, sequences_number {len(seq_one_hot_code)}')