-
Notifications
You must be signed in to change notification settings - Fork 3
/
change_gtf.py
executable file
·95 lines (72 loc) · 3.76 KB
/
change_gtf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
@authors: Juan L. Trincado
@email: [email protected]
change_gtf.py: given an geneAnnotated.bed generate with GenestoJunctions.R,
it associates the genes in the junctions of this file to the ones provided in the
other junctions file
"""
import sys
import time
def main():
try:
geneAnnotated_path = sys.argv[1]
junctions_path = sys.argv[2]
output_path = sys.argv[3]
# geneAnnotated_path = "/genomics/users/juanluis/TEST/aux.sorted.geneAnnotated.bed"
# junctions_path = "/genomics/users/juanluis/TEST/Junctions_readCounts_TCGA.formatted.tab"
# output_path = "/genomics/users/juanluis/TEST/Junctions_readCounts_TCGA.formatted2.tab"
print("Starting execution: "+time.strftime('%H:%M:%S')+"\n")
############################################################################################
#1. Read the geneAnnotated_file and save all the junction-gene associations in a dictionary
# and the junction type
############################################################################################
print("Reading "+geneAnnotated_path+"...")
dict_junction_gene, dict_junction_type = {}, {}
with open(geneAnnotated_path) as f:
next(f)
for line in f:
tokens = line.rstrip().split("\t")
#TODO: The first time that I implemented this script, I needed to add1 to the end
# (I mapped the RefSeq TCGA junctions to Ensembl). NOw, I need to map the Ensembl SCLC
# junctions to Refseq, and it seems that it's not necesary this step. For now I'm gonna comment this part
# but it will be good take a look on this in the future
# Add 1 to the end cordinate and substitue ; by :
# tokens2 = tokens[0].split(";")[:-1]
# tokens2[2] = str(int(tokens2[2]) + 1)
# id_formatted = ":".join(tokens2)
# Substitue ; by :
tokens2 = tokens[0].split(";")[:-1]
id_formatted = ":".join(tokens2)
dict_junction_gene[id_formatted] = tokens[7]
dict_junction_type[id_formatted] = tokens[8]
############################################################################################
#2. Read the junctions_file and associate to the existing junctions the new saved genes
############################################################################################
print("Formatting " + junctions_path + "...")
outFile = open(output_path, 'w')
with open(junctions_path) as f:
outFile.write(next(f))
for line in f:
tokens = line.rstrip().split("\t")
#TODO: The first time that I implemented this script, the junction_file had :, but now I have ;
# id_formatted = ":".join(tokens[0].split(":")[:-1])
id_formatted = ":".join(tokens[0].split(";")[:-1])
if (id_formatted in dict_junction_gene):
tokens[6] = dict_junction_gene[id_formatted]
tokens[7] = dict_junction_type[id_formatted]
else:
#If the key doesn't exist, assign the junction to no gene and the junction type to 5 (new junction)
tokens[6] = ""
tokens[7] = str(5)
new_line = "\t".join(tokens)
outFile.write(new_line + "\n")
outFile.close()
print("Saved " + output_path)
print("\nDone. Exiting program. "+time.strftime('%H:%M:%S')+"\n")
exit(0)
except Exception as error:
print('ERROR: ' + repr(error))
print("Aborting execution")
sys.exit(1)
if __name__ == '__main__':
main()