-
Notifications
You must be signed in to change notification settings - Fork 0
/
select_ensembl_gtfid_from_ensembl_gffid.py
161 lines (132 loc) · 4.75 KB
/
select_ensembl_gtfid_from_ensembl_gffid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/python
# select_ensembl_gtfid_from_ensembl_gffid.py
# Format Ensembl gtf files to cope with the formatted gff3 files
# Sophie Lemoine
# Genomicpariscentre
# Usage : python select_ensembl_gtfid_from_ensembl_gffid.py -o Ensembl organism name -e Ensembl version -v
# Arguments :
# -o or --organism Ensembl organism name Ex: Mus_musculus
# -e or --ensemblversion Ensembl version Ex: 84
# -v or --verbose
# Exemple :
# python select_ensembl_gtfid_from_ensembl_gffid.py -o Mus_musculus -e 84 -v
from validannot_env import gff3_path
from validannot_env import gtf_path
from validannot_env import log_path
import argparse
import subprocess
import time
import fileinput
import os
import sys
import logging
#############
# Functions #
#############
# Checks if the path and file exist
# Argv = path and file
# Returns a var that can be a boolean or an integer
# If true, path and gz file exist, if false, path and gunzipped file exist, if 2, path or file do not exist
def make_sure_file_exists(p,f):
if os.path.exists(p):
if os.path.isfile(p+"/"+f[:-3]):
logging.info("Found gunzipped file: "+ f[:-3] + "\n No need to gunzip...")
gz = False
elif os.path.isfile(p+"/"+f):
logging.info("Found gzipped file: "+ f +"...")
gz = True
else:
gz=2
else:
gz=2
return gz
# Gets the chromosomes names in the gff3 file to format the fasta file
# Argv = the gff file
# Returns a chromosome list
def get_chromosome_from_ensemblgff(gff_in):
wanted = []
for line in fileinput.input(gff_in):
if line.startswith('#'):
continue
gff_fields = line.strip().split('\t')
if gff_fields[2] == 'chromosome' and not '_' in gff_fields[0]:
wanted.append(gff_fields[0])
logging.info("chromosome list for "+gff_in+" : "+','.join(wanted))
fileinput.close()
return wanted
# Add lines in the gtf file header
# Argv = the gtf file handle
# Returns a new header
def format_new_header(gtfin_h):
header_fields = []
for line in gtfin_h:
if line.startswith('#!genome-') or line.startswith('#!genebuild-'):
header_fields.append(line)
header_fields.append('#!modified by SGDB on the '+time.strftime("%Y-%m-%d")+'\n')
header_fields.append('#!regular chromosome only gtf file\n')
return header_fields
#############
# Arguments and usage
parser = argparse.ArgumentParser(description='A script to build a gtf file containing only the chromosomes described in a gff3 file.')
parser.add_argument('-o', '--organism', help='Ensembl organism name (Ex: Mus_musculus)', required=True)
parser.add_argument('-e', '--ensemblversion', help='Ensembl version Ex: 84', required=True)
parser.add_argument('-v', '--verbose', help='Increase output verbosity', action='store_true')
args = vars(parser.parse_args())
organism = args['organism']
version = args['ensemblversion']
if args['verbose']:
logging.basicConfig(filename=log_path +'select_ensembl_gtfid_from_gffid.log',level=logging.INFO,format='%(asctime)s %(message)s')
gff_in = "only_chr_" + organism + "_ens" + version + "_sgdb.gff.gz"
# Check if gff coming in exists
isgz_gff = make_sure_file_exists(gff3_path,gff_in)
# gunzip or not gff
if type(isgz_gff) is int:
logging.info(os.strerror(isgz_gff))
sys.exit(1)
else:
if not isgz_gff:
gff_in = gff_in[:-3]
else:
inF = gff3_path + gff_in
proc = subprocess.Popen(["gunzip", gff3_path + gff_in], stderr=subprocess.PIPE)
for line in proc.stderr:
logging.info(line)
gfffile_in = gff_in[:-3]
gtf_in = organism + "_ens" + version + ".gtf.gz"
# Check if gtf coming in exists
isgz_gtf = make_sure_file_exists(gtf_path, gtf_in)
# gunzip or not gtf
if type(isgz_gtf) is int:
logging.info(os.strerror(isgz_gtf))
sys.exit(1)
else:
if not isgz_gtf:
gtf_in = gtf_in[:-3]
else:
inF = gtf_path + gtf_in
proc = subprocess.Popen(["gunzip", gtf_path + gtf_in], stderr=subprocess.PIPE)
for line in proc.stderr:
logging.info(line)
gtf_in = gtf_in[:-3]
# GFF analysis
wanted = get_chromosome_from_ensemblgff(gff3_path+gff_in)
logging.info("writing new gtf file....")
# Open gtf coming in
gtfin = open(gtf_path+gtf_in, "rU")
# Create output gtf file and handle it
gtf_out = gtf_path + "only_chr_" + gtf_in
gtfout = open(gtf_out, "w")
# Manage header
header = format_new_header(gtfin)
for h in header:
gtfout.write(h)
gtfin.seek(0)
# Read and select data
for line in gtfin:
if not line.startswith('#!'):
gtf_fields = line.strip().split('\t')
if gtf_fields[0] in wanted:
gtfout.write(line)
gtfout.close()
gtfin.close()
logging.info("New gtf file written.")