-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsnp_summary.pl
executable file
·214 lines (167 loc) · 5.37 KB
/
snp_summary.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/perl
#snp_summary.pl by Megan Supple, 2 Mar 2012
#script to summarize snp calls from the matrix genotype file
#usage snp_summary.pl <in.matrix> <ref.fasta>
#<in.matrix> is the matrix genotype format file generated from vcf2format.pl
#<ref.fasta> is a fasta reference file
#output is
#STDOUT the number of positions with variation in any of the samples
#genotyped_summary.txt number of positions genotyped per sample
#het_summary.txt number of heterozygote snps per sample
#snp_summary.txt number of snps relative to the reference per sample
use lib $ENV{PERL5LIB};
use strict;
use warnings;
my $usage = "Usage: snp_summary.pl <in.matrix> <ref.fasta>
options (required):
<in.matrix> a matrix of genotypes (formatted as in vcf2format.pl)
<ref.fasta> a reference sequence in fasta format
";
die "$usage" unless (@ARGV == 2);
#read in command line arguments
my ($ingenos, $inref)=@ARGV;
#open input file and output file for snp summar
open(GENOS, $ingenos)||die "can't open input fst file. $!\n";
open(OUTGENO, ">genotyped_summary.txt");
open(OUTSNP, ">snp_summary.txt");
open(OUTHET, ">het_summary.txt");
#read in reference sequence
open(REF, $inref)||die "can't open input fst file. $!\n";
my %ref;
my $contig;
my $seq="";
while (my $fasta=<REF>)
{
chomp($fasta);
#check if new contig, if so add to hash
if ($fasta=~ m/^>.*/)
{
#record contig name
$contig=substr($fasta, 1);
#reset seq to null
$seq="";
}
else
{
#add seq
$seq.=$fasta;
}
#add contig and seq to hash
$ref{$contig}=$seq;
}
#my @list=keys(%ref);
#print @list;
#print input parameters to output file
print OUTGENO "input genotypes file=$ingenos\nnumber of positions genotyped\n";
print OUTSNP "input genotypes file=$ingenos\nnumber of snps relative to the referece\n";
print OUTHET "input genotypes file=$ingenos\nnumber of heterozygote snps\n";
#read header info
my $line=<GENOS>;
my @samples=split(" ", $line);
#remove non sample information
shift(@samples);
shift(@samples);
my $n_samples=@samples;
print "number of samples=$n_samples\n";
#print headers
print "contig\tnum_snps_among_samples\n";
my $samples=join("\t",@samples);
print OUTGENO "contig\t$samples\n";
print OUTSNP "contig\t$samples\n";
print OUTHET "contig\t$samples\n";
my @genoed=(0)x$n_samples;
my $genoed;
my @hets=(0)x$n_samples;
my $hets;
my @ref_snps=(0)x$n_samples;
my $ref_snps;
my @temp_genos=();
my $pop_snps=0;
my $current_contig="";
my @contig_seq;
#process input file until EOF
while($line=<GENOS>)
{
#break up line into component parts
my @entry=split(" ", $line);
#if new contig
if ($entry[0] ne $current_contig)
{
#print out last contig results, unless first contig
if ($current_contig ne "")
{
$genoed=join("\t",@genoed);
$ref_snps=join("\t",@ref_snps);
$hets=join("\t",@hets);
print OUTGENO "$current_contig\t$genoed\n";
print OUTSNP "$current_contig\t$ref_snps\n";
print OUTHET "$current_contig\t$hets\n";
print "$current_contig\t$pop_snps\n";
}
#zero values and start with new contig information
$current_contig=$entry[0];
$pop_snps=0;
@contig_seq=split("",$ref{$current_contig});
#loop over individuals and start new counts
for (my $i=0; $i<$n_samples; $i++)
{
@genoed=(0) x $n_samples;
@ref_snps=(0) x $n_samples;
@hets=(0)x$n_samples;
if ($entry[$i+2] ne "N")
{
#increment genoed
$genoed[$i]++;
#determine if SNP relative to reference
if ($entry[$i+2] ne @contig_seq[$entry[1]+1])
{
$ref_snps[$i]++;
}
#make list of genotypes, excluding Ns
push (@temp_genos, $entry[$i+2]);
}
if ($entry[$i+2] ne "N" && $entry[$i+2] ne "A" && $entry[$i+2] ne "T" && $entry[$i+2] ne "C" && $entry[$i+2] ne "G") {$hets[$i]++}
}
#determine if there is variation at this position within the population
my %string = map { $_, 1 } @temp_genos;
if (keys %string != 1 && @temp_genos) {$pop_snps++;}
}
#if same contig
else
{
my @temp_genos=();
#loop over individuals and increment counts
for (my $i=0; $i<$n_samples; $i++)
{
if ($entry[$i+2] ne "N")
{
#increment genoed
$genoed[$i]+=1;
#determine if SNP relative to reference
if ($entry[$i+2] ne @contig_seq[$entry[1]-1])
{
$ref_snps[$i]++;
}
#make list of genotypes, excluding Ns
push (@temp_genos, $entry[$i+2]);
}
if ($entry[$i+2] ne "N" && $entry[$i+2] ne "A" && $entry[$i+2] ne "T" && $entry[$i+2] ne "C" && $entry[$i+2] ne "G") {$hets[$i]++}
}
#determine if there is variation at this position in any sample
my %string = map { $_, 1 } @temp_genos;
if (keys %string != 1 && @temp_genos) {$pop_snps++;}
}
}
#print out final results
$genoed=join("\t",@genoed);
$hets=join("\t",@hets);
$ref_snps=join("\t",@ref_snps);
print OUTGENO "$current_contig\t$genoed\n";
print OUTSNP "$current_contig\t$ref_snps\n";
print OUTHET "$current_contig\t$hets\n";
print "$current_contig\t$pop_snps\n";
close GENOS;
close OUTGENO;
close OUTSNP;
close OUTHET;
print "done!\n";