-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_seqs_from_gff.py
executable file
·138 lines (119 loc) · 6.75 KB
/
get_seqs_from_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python
import csv
from sys import argv
"""
This script will read a fasta file and a gff and cut out the sequence from the fasta specified in the gff start and
stop fields. Important: make sure your fasta headers and gff headers (field 1) can match to each other.
Usage:
get_seqs_from_gff.py <in_sequences.fasta> <in_coordinates.gff> <output_filename.fasta> <field_name>
where field name is the third column in the gff file (gene/CDS/exon/intron etc.)
By: Alvar Almstedt ([email protected])
"""
def read_fasta(fasta): # This fucntion reads the fasta input as argument 1 and returns as a dict where header : seq
file_contents = {}
data = fasta.split("\n")
header = None
for i in data:
if i.startswith(">"):
header = str(i)
else:
file_contents[header[1:].rstrip()] = i.rstrip()
return file_contents
def clean_input_fasta(fastafile): # This function cleans away any possible wrapping from the input fasta
with open(fastafile, "r+") as infile:
fixedfasta = ""
iterator1 = 0
for line in infile:
if line.startswith(">") and infile:
if iterator1 != 0:
fixedfasta += "\n"
fixedfasta += str(line)
iterator1 = 1
else:
fixedfasta += str(line.rstrip())
iterator1 = 1
return fixedfasta
def read_gff(gff_in): # This function reads the gff file input as argument 2 as a list of dicts (see csv module)
with open(gff_in, "r") as infile:
fieldnames = ["headername", "method", "type", "start", "stop", "dot1", "plus", "dot2", "notes"]
gff_reader = csv.DictReader(infile, delimiter='\t', fieldnames=fieldnames)
file_contents = []
for i in gff_reader:
file_contents.append(i)
return file_contents
def write_fasta_outfile(fasta_in, gff_in, outfile): # This function parses the gff and fasta and writes out regions from
first = 0 # the fasta within coordinates from the gff "CDS" or "protein_coding_region"
with open(outfile, "w+") as fasta_out:
for fasta_keys in fasta_in:
for gff_line in gff_in:
if set(gff_line["headername"].split()) & set(fasta_keys.split()):
if gff_line["type"] == type_to_cut:
start = int(gff_line["start"]) - 1 # minus 1 fixes off by one error
stop = int(gff_line["stop"])
length = str(int(gff_line["stop"]) - int(gff_line["start"]))
if first != 0:
fasta_out.write("\n")
fasta_out.write(">" + gff_line["headername"] + "_" + gff_line["type"] + "_from_" +
gff_line["start"] + "bp_to_" + gff_line["stop"] + "bp_length_" +
length + "bp" + "\n")
first = 1
fasta_out.write(fasta_in[fasta_keys][int(start):int(stop)])
def remove_introns(fasta_in, gff_in, outfile): # under construction. this could be done in a smarter way by putting
first = 0 # by putting the exons in a list or something. im dumb :(
with open(outfile, "w+") as fasta_out:
for fasta_keys in fasta_in:
gene = ""
gene_first_region = ""
gene_second_region = ""
gene_third_region = ""
gene_fourth_region = ""
gene_fifth_region = ""
for gff_line in gff_in:
if set(gff_line["headername"].split()) & set(fasta_keys.split()):
if gff_line["type"] == type_to_cut:
gene_start = int(gff_line["start"]) - 1 # minus 1 fixes off by one error
gene_stop = int(gff_line["stop"])
length = str(int(gff_line["stop"]) - int(gff_line["start"]))
gene = fasta_in[fasta_keys][int(gene_start):int(gene_stop)]
if first != 0:
fasta_out.write("\n")
fasta_out.write(">" + gff_line["headername"] + "_" + gff_line["type"] + "_from_" +
gff_line["start"] + "bp_to_" + gff_line["stop"] + "bp_length_" +
length + "bp" + "\n")
elif gff_line["type"] == "intron" and not gene_first_region:
first_intron_start = int(gff_line["start"]) - 1
first_intron_stop = int(gff_line["stop"])
gene_first_region = fasta_in[fasta_keys][:int(first_intron_start)]
# print "first elif"
elif gff_line["type"] == "intron" and gene_first_region and not gene_second_region:
second_intron_start = int(gff_line["start"]) - 1
second_intron_stop = int(gff_line["stop"])
gene_second_region = fasta_in[fasta_keys][first_intron_stop:second_intron_start]
# print "second elif"
elif gff_line["type"] == "intron" and gene_second_region and not gene_third_region:
third_intron_start = int(gff_line["start"]) - 1
third_intron_stop = int(gff_line["stop"])
gene_third_region = fasta_in[fasta_keys][second_intron_stop:third_intron_start]
# print "third elif"
elif gff_line["type"] == "intron" and gene_third_region and not gene_fourth_region:
fourth_intron_start = int(gff_line["start"]) - 1
fourth_intron_stop = int(gff_line["stop"])
gene_fourth_region = fasta_in[fasta_keys][third_intron_stop:fourth_intron_start]
gene_fifth_region = fasta_in[fasta_keys][fourth_intron_stop:]
# print "fifth elif"
if gff_line["type"] != "intron" and gff_line["type"] != type_to_cut and first != 0:
intron_free_gene = gene_first_region + gene_second_region + gene_third_region + gene_fourth_region + gene_fifth_region
fasta_out.write(intron_free_gene)
print "last if"
else:
fasta_out.write(gene)
print "no introns"
first = 1
in_file_fasta = argv[1]
in_file_gff = argv[2]
output_file_name = argv[3]
type_to_cut = argv[4]
ready_fasta = read_fasta(clean_input_fasta(in_file_fasta))
ready_gff = read_gff(in_file_gff)
remove_introns(ready_fasta, ready_gff, output_file_name)
# write_fasta_outfile(ready_fasta, ready_gff, output_file_name)