forked from CNuge/snp-placer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
place_snps.py
executable file
·160 lines (122 loc) · 5.29 KB
/
place_snps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
import gc
import argparse
import pandas as pd
from itertools import groupby
from pandas import Series, DataFrame
from parse import cigarParse
from parse import samParse
def read_input_files(list_of_inputs, names = None):
""" read in a list of files (from argparse), turn them into dataframes
and concatenate them if the length of the list exceeds 1."""
if len(list_of_inputs) == 1:
data = pd.read_table(list_of_inputs[0], sep='\t', names = names, index_col=None)
else:
data_files = []
for i in list_of_inputs:
data_files.append(pd.read_table(i, sep='\t',names = names, index_col=None))
data = pd.concat(data_files)
return data
def read_sam_files(list_of_inputs):
""" this calls the read_input_files and stores the sam_header """
sam_header = ['Qname','Flag','Rname','Pos','MapQ','Cigar','Rnext',
'Pnext', 'TLEN', 'SEQ', 'QUAL','tag','type','value','value2']
return read_input_files(list_of_inputs, sam_header)
def sam_subset(snp_names, sam_file_df):
""" take the subset of rows from the sam file that contain snps found
in the snp list. Equivalent to an inner join"""
sam_rows = sam_file_df.loc[sam_file_df['Qname'].isin(snp_names)]
return sam_rows
def sam_polymorphism_column_merger(sam_dataframe, snp_dataframe):
""" grab the bp, and polymorphisms from the snp dataframe
note that if two snps on one contig, there will be multiple
rows for that snp"""
return pd.merge(sam_dataframe, snp_dataframe,
how='left', left_on='Qname', right_on='SNP')
def calculate_new_bp_data(sam_dataframe):
""" pass in sam df, this will call the relevant cigar functions
determining the length of the alignment, and the adjusted
position of the bp based on cigar alignment
apply cigarParse bp adjustment to each row"""
sam_dataframe['adjusted_bp_SNP_location'] = sam_dataframe.apply(
lambda x: cigarParse.cigar_string_change(x['bp'],
x['Cigar']) , axis=1)
""" count the alignment length for each row using cigar """
sam_dataframe['alignment_length'] = sam_dataframe.apply(
lambda x: cigarParse.alignment_length(x['Cigar']) , axis=1)
return sam_dataframe
def snp_placement_dataframe(sam_dataframe):
"""apply snp_contig_location across a dataframe """
sam_dataframe['contig_location'] = sam_dataframe.apply(
lambda x: samParse.snp_contig_location(x['Flag'],
x['Pos'],
x['adjusted_bp_SNP_location'],
x['alignment_length']), axis=1)
return sam_dataframe
def output_to_vcf(output_df):
""" take the informaiton in the dataframe and turn it
into .vcf file format with the following header:
#CHROM POS ID REF ALT QUAL FILTER INFO"""
if type(output_df['SNP'][0]) == str:
output_df['adj_name'] = output_df['SNP'] + \
'_' + \
output_df['Polymorphism'] + \
'_' + \
output_df['bp'].astype(str)
else:
output_df['adj_name'] = output_df['SNP'].astype(str) + \
'_' + \
output_df['Polymorphism'] + \
'_' + \
output_df['bp'].astype(str)
output_df['full_adj_name'] = output_df.apply(
lambda x: samParse.compliment_name(x['adj_name'],
x['Flag']), axis=1)
vcf_out = output_df[['Rname',
'contig_location',
'full_adj_name',
'Polymorphism',
'MapQ',
'Flag']].copy()
vcf_out['FILTER'] = '.'
vcf_out['INFO'] = '.'
vcf_out['QUAL'] = '.'
vcf_out['REF_check'] = vcf_out['Polymorphism'].apply(
lambda x: x.split('/')[0])
vcf_out['ALT_a'] = vcf_out['Polymorphism'].apply(
lambda x: x.split('/')[1:])
vcf_out['ALT_check'] = vcf_out['ALT_a'].apply(
lambda x: ','.join(x))
vcf_out['REF'] = vcf_out.apply(
lambda x: samParse.allele_comp_check(x['REF_check'],
x['Flag']), axis=1)
vcf_out['ALT'] = vcf_out.apply(
lambda x: samParse.allele_comp_check(x['ALT_check'],
x['Flag']), axis=1)
vcf_out = vcf_out[['Rname','contig_location','full_adj_name','REF','ALT','MapQ','FILTER','INFO']]
vcf_out.columns =['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO']
return vcf_out
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-s','--samfile', nargs = '+', required = True,
help = 'The sam file(s) you wish to process. Pass in multiple files behind one flag. \
i.e. -s ex1.sam ex2.sam')
parser.add_argument('-p', '--snpfile', nargs = '+', required = True,
help = 'The snp file(s) you wish to process. Pass in multiple files behind one flag. \
i.e. -p ex1.txt ex2.txt')
parser.add_argument('-o', '--output', default = 'placed_snps.vcf',
help = 'The name of the output .vcf file. Default is placed_snps.vcf')
args = parser.parse_args()
#note these must receive lists as first argument
sam_dat = read_sam_files(args.samfile)
snp_input_dat = read_input_files(args.snpfile)
#subset the sam alignments for rows matching the snp input
sam_data_on_contigs = sam_subset(snp_input_dat['SNP'], sam_dat)
#merge the snp and sam dataframes
all_polymorphism_data = sam_polymorphism_column_merger(sam_data_on_contigs, snp_input_dat)
#calculate new bp data
all_polymorphism_data = calculate_new_bp_data(all_polymorphism_data)
all_polymorphism_data = snp_placement_dataframe(all_polymorphism_data )
#default .vcf has name 'placed_snps.vcf'
polymorphism_vcf = output_to_vcf(all_polymorphism_data)
polymorphism_vcf.to_csv(args.output, sep='\t',index=False)