-
Notifications
You must be signed in to change notification settings - Fork 3
/
count_all_gene_families.py
150 lines (119 loc) · 7.2 KB
/
count_all_gene_families.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python
# coding: utf-8
__author__ = 'LIN Dan'
import argparse
import os
from Bio import SeqIO
# Change format
def turn_bioseq_into_list(file):
bio_seq = SeqIO.parse(open(file), 'fasta')
seq_list = list()
for seq in bio_seq:
seq_list.append(seq)
return seq_list
# Count the number of all kinds of gene families
def count_gene_families(seq_list, i, f_single_copy_9_species, f_present_9_species, f_single_copy_7_species, f_present_7_species, f_single_copy_5_species, f_present_5_species, f_single_copy_2_species, f_present_2_species, f_unshared_gene_families, f_singletons_laboriosa, f_singletons_dorsata):
Temnothorax = list()
Dinoponera = list()
Mellifera = list()
Cerana = list()
Florea = list()
Terrestris = list()
Impatiens = list()
Laboriosa = list()
Dorsata = list()
for seq in seq_list:
if 'Temnothorax' in seq.id:
Temnothorax.append(seq)
if 'Dinoponera' in seq.id:
Dinoponera.append(seq)
if 'Mellifera' in seq.id:
Mellifera.append(seq)
if 'Cerana' in seq.id:
Cerana.append(seq)
if 'Florea' in seq.id:
Florea.append(seq)
if 'Terrestris' in seq.id:
Terrestris.append(seq)
if 'Impatiens' in seq.id:
Impatiens.append(seq)
if 'Laboriosa' in seq.id:
Laboriosa.append(seq)
if 'Dorsata' in seq.id:
Dorsata.append(seq)
else:
pass
if all([len(Temnothorax) == 1, len(Dinoponera) == 1, len(Mellifera) == 1, len(Cerana) == 1, len(Florea) == 1, len(Dorsata) == 1, len(Terrestris) == 1, len(Impatiens) == 1, len(Laboriosa) == 1]):
f_single_copy_9_species.write(str(i) + '\n')
if all([len(Temnothorax) > 1, len(Dinoponera) > 1, len(Mellifera) > 1, len(Cerana) > 1, len(Florea) > 1, len(Dorsata) > 1, len(Terrestris) > 1, len(Impatiens) > 1, len(Laboriosa) > 1]):
f_present_9_species.write(str(i) + '\t' + str(len(Laboriosa)) + '\t' + str(len(Dorsata)) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 1, len(Cerana) == 1, len(Florea) == 1, len(Dorsata) == 1, len(Terrestris) == 1, len(Impatiens) == 1, len(Laboriosa) == 1]):
f_single_copy_7_species.write(str(i))
f_single_copy_7_species.write('\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) > 1, len(Cerana) > 1, len(Florea) > 1, len(Dorsata) > 1, len(Terrestris) > 1, len(Impatiens) > 1, len(Laboriosa) > 1]):
f_present_7_species.write(str(i) + '\t' + str(len(Laboriosa)) + '\t' + str(len(Dorsata)) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 1, len(Cerana) == 1, len(Florea) == 1, len(Dorsata) == 1, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) == 1]):
f_single_copy_5_species.write(str(i) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) > 1, len(Cerana) > 1, len(Florea) > 1, len(Dorsata) > 1, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) > 1]):
f_present_5_species.write(str(i) + '\t' + str(len(Laboriosa)) + '\t' + str(len(Dorsata)) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 0, len(Cerana) == 0, len(Florea) == 0, len(Dorsata) == 1, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) == 1]):
f_single_copy_2_species.write(str(i) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 0, len(Cerana) == 0, len(Florea) == 0, len(Dorsata) > 1, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) > 1]):
f_present_2_species.write(str(i) + '\t' + str(len(Laboriosa)) + '\t' + str(len(Dorsata)) + '\n')
if all([len(Temnothorax) + len(Dinoponera) + len(Mellifera) + len(Cerana) + len(Florea) + len(Terrestris) + len(Impatiens) > 0, len(Laboriosa) > 0, len(Dorsata) == 0]):
f_unshared_gene_families.write(str(i) + '\t' + str(len(Laboriosa)) + '\n')
if all([len(Temnothorax) + len(Dinoponera) + len(Mellifera) + len(Cerana) + len(Florea) + len(Terrestris) + len(Impatiens) > 0, len(Dorsata) > 0, len(Laboriosa) == 0]):
f_unshared_gene_families.write(str(i) + '\t' + str(len(Dorsata)) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 0, len(Cerana) == 0, len(Florea) == 0, len(Dorsata) == 0, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) > 0]):
f_singletons_laboriosa.write(str(i) + '\t' + str(len(Laboriosa)) + '\n')
if all([len(Temnothorax) == 0, len(Dinoponera) == 0, len(Mellifera) == 0, len(Cerana) == 0, len(Florea) == 0, len(Dorsata) > 0, len(Terrestris) == 0, len(Impatiens) == 0, len(Laboriosa) == 0]):
f_singletons_dorsata.write(str(i) + '\t' + str(len(Dorsata)) + '\n')
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--gene_family_directory',
required = True,
type = str,
help = 'Path to gene families folder. Please provide the absolute path.')
parser.add_argument('--file_suffix',
required = True,
type = str,
help = 'Suffix of the gene families files, e.g., cds.muscle')
parser.add_argument('--gene_family_number',
required = True,
type = int,
help = 'Number of the gene families files')
args = parser.parse_args()
gene_family_directory = args.gene_family_directory
file_suffix = args.file_suffix
gene_family_number = args.gene_family_number
os.chdir(gene_family_directory)
f_single_copy_9_species = open('single_copy_9_species', 'a')
f_present_9_species = open('present_9_species', 'a')
f_single_copy_7_species = open('single_copy_7_species', 'a')
f_present_7_species = open('present_7_species', 'a')
f_single_copy_5_species = open('single_copy_5_species', 'a')
f_present_5_species = open('present_5_species', 'a')
f_single_copy_2_species = open('single_copy_2_species', 'a')
f_present_2_species = open('present_2_species', 'a')
f_unshared_gene_families = open('unshared_gene_families', 'a')
f_singletons_laboriosa = open('singletons_laboriosa', 'a')
f_singletons_dorsata = open('singletons_dorsata', 'a')
target_file = [str(i) + file_suffix for i in range(0, gene_family_number)]
length = len(target_file)
for i in range(0, length):
print('Now, openning the %d th gene family' %i)
seq_list = turn_bioseq_into_list(target_file[i])
count_gene_families(seq_list, i, f_single_copy_9_species, f_present_9_species, f_single_copy_7_species, f_present_7_species, f_single_copy_5_species, f_present_5_species, f_single_copy_2_species, f_present_2_species, f_unshared_gene_families, f_singletons_laboriosa, f_singletons_dorsata)
f_single_copy_9_species.close()
f_single_copy_7_species.close()
f_single_copy_5_species.close()
f_single_copy_2_species.close()
f_present_9_species.close()
f_present_7_species.close()
f_present_5_species.close()
f_present_2_species.close()
f_unshared_gene_families.close()
f_singletons_laboriosa.close()
f_singletons_dorsata.close()
if __name__ == "__main__":
main()