-
Notifications
You must be signed in to change notification settings - Fork 3
/
GenestoJunctions.py
executable file
·162 lines (134 loc) · 6.83 KB
/
GenestoJunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
@authors: Juan L. Trincado
@email: [email protected]
GenestoJunctions.py: The next code will get the unique genes associated to each junction and will associate this list of genes to each junctions in the original bed file
Additionally, we will study the type of each junction (1: Annotated, 2: New connection, 3: 5ss, 4: 3ss, 5: New junction)
arg[1]: Input file --> SJ.out.enriched.filtered.bed: file with the exons that overlaps exactly with the junction
arg[2]: Output file --> SJ.out.junction.type: output file with the list junction and the type of each junction
"""
import logging, sys
import pandas as pd
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# create console handler and set level to info
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# add formatter to ch
ch.setFormatter(formatter)
# add ch to logger
logger.addHandler(ch)
def consecutives_exons(dict):
'''
Check if in the dictionary there are two consecutive
exons associated to the same transcript
'''
# For each key in the dict, sort the list
for key in dict:
list_exons = dict[key]
sortedlist = []
for x in sorted(list_exons): sortedlist.append(x)
#Iterate the list, checking if there are consecutive exons
for i in range(1,len(sortedlist)):
if(int(sortedlist[i-1])==int(sortedlist[i])-1):
return True
return False
def main():
try:
print('\t\tGenestoJunctions.py: Starting')
# Load the fasta file
original_file_bed_filtered_path = sys.argv[1]
output_path = sys.argv[2]
# original_file_bed_filtered_path = "/projects_rg/test_Junckey/SAMPLE/SJ.out.enriched.filtered.bed"
# output_path = "/projects_rg/test_Junckey/SAMPLE/SJ.out.junction.type.bed"
# 1. Load the intersect file between the junctions and the annotation
original_file_bed_filtered = pd.read_table(original_file_bed_filtered_path, delimiter="\t", header=None)
original_file_bed_filtered.columns = ['chr', 'start', 'end', 'id', 'score', 'strand', 'annotated',
'chr2', 'info1', 'info2', 'start2', 'end2', 'dot', 'strand2',
'dot2', 'large_info', 'score2']
# 2. Get the transcript and exon_number info
# transcript_id = original_file_bed_filtered['large_info'].apply(lambda x: x.split(";")[1][16:31]).tolist()
transcript_id = original_file_bed_filtered['large_info'].apply(lambda x: x.split("transcript_id")[1].split(";")[0].strip().replace('\"','')).tolist()
# exon_number = original_file_bed_filtered['large_info'].apply(lambda x: x.split(";")[2].split("\"")[1]).tolist()
exon_number = original_file_bed_filtered['large_info'].apply(lambda x: x.split("exon_number")[1].split(";")[0].strip().replace('\"','')).tolist()
#Iterate over the genotype file
flag_Anno, flag_5s, flag_3s= 0, 0, 0
transcript_dict = {}
id = ""
output_file = open(output_path, 'w')
for i in range(len(original_file_bed_filtered)):
percentage = str(round(i*100/len(original_file_bed_filtered),3))+ "%"
sys.stdout.write("\r%s" % percentage)
sys.stdout.flush()
line = original_file_bed_filtered.iloc[i,]
if(i==0):
id = original_file_bed_filtered.iloc[i,3]
elif(id!=original_file_bed_filtered.iloc[i,3]):
# # Case 1: Annotated junction
# if(flag_Anno==1):
# output_file.write(id+"\t1\n")
# # Case 2: New connection (from annotated exons)
# elif(flag_5s==1 & flag_3s==1):
# #If there are two exons folowwing each other, then it's a case 1
# if(consecutives_exons(transcript_dict)):
# output_file.write(id + "\t1\n")
# else:
# output_file.write(id+"\t2\n")
# Case 1: Annotated junction
if(flag_5s==1 & flag_3s==1):
#If there are two exons folowwing each other, then it's a case 1
if(consecutives_exons(transcript_dict)):
# Case 1: Annotated junction
output_file.write(id + "\t1\n")
else:
# Case 2: New connection (from annotated exons)
output_file.write(id+"\t2\n")
# Case 3: A5ss
elif (flag_5s == 1 and flag_3s == 0):
output_file.write(id+"\t3\n")
# Case 4: A3ss
elif (flag_5s == 0 and flag_3s == 1):
output_file.write(id+"\t4\n")
# Case 0: Not studied
else:
output_file.write(id+"\t \n")
id = original_file_bed_filtered.iloc[i, 3]
flag_Anno, flag_5s, flag_3s = 0, 0, 0
transcript_dict = {}
# Check the new line
# if(original_file_bed_filtered.iloc[i,6]==1):
# flag_Anno = 1
# elif (original_file_bed_filtered.iloc[i,1]==original_file_bed_filtered.iloc[i,11]-1):
if (original_file_bed_filtered.iloc[i, 1] == original_file_bed_filtered.iloc[i, 11] - 1):
flag_5s = 1
#Save the transcript id and the exon number
#If the key already exists, add it to the list of exons
if(transcript_id[i] in transcript_dict):
transcript_dict[transcript_id[i]].append(exon_number[i])
#If the key doesn't exist, create a new list
else:
transcript_dict[transcript_id[i]] = [exon_number[i]]
elif (original_file_bed_filtered.iloc[i, 2] == original_file_bed_filtered.iloc[i, 10]):
flag_3s = 1
#Save the transcript id and the exon number
#If the key already exists, add it to the list of exons
if(transcript_id[i] in transcript_dict):
transcript_dict[transcript_id[i]].append(exon_number[i])
#If the key doesn't exist, create a new list
else:
transcript_dict[transcript_id[i]] = [exon_number[i]]
percentage = "100%\n"
sys.stdout.write("\r%s" % percentage)
# 3. Close the file handler
print('\t\tGenestoJunctions.py: Saved ' + output_path)
output_file.close()
print('\t\tGenestoJunctions.py: Finish')
exit(0)
except Exception as error:
logger.error('\nERROR: ' + repr(error))
logger.error('\t\tGenestoJunctions.py: Aborting execution')
exit(1)
if __name__ == '__main__':
main()