-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvcf2format.pl
executable file
·177 lines (143 loc) · 4.94 KB
/
vcf2format.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/perl
#vcf2format.pl by Megan Supple 27 May 2013
#last updated 8 Nov 2013
#script to convert vcf file(s) into various formats
#option to specifiy a single contig
#use IUPAC ambiguity codes
#possible output formats
#fasta
#matric
#usage: vcf2format.pl [options] <infile>
#<infile> is a text file that lists each race and vcf files, each file on a single line
#race1 race1.vcf
#race2 race2.vcf
use strict;
use warnings;
use lib $ENV{PERL5LIB};
use FormatGenos;
use Bio::PopGen::Population;
use Getopt::Long;
use Data::Dumper;
#read in command line arguments
my $contig; #contig of interest
my $fasta; #generate a fasta file
my $matrix; #generates a matrix file
GetOptions ( "contig=s" => \$contig,
"fasta" => \$fasta,
"matrix" => \$matrix
);
my $usage = "Usage: vcf2format.pl [options] <infile>
options:
<infile> an infile that lists each race and vcf file, each file on a single line
-contig <string> specify a single contig
-fasta generate fasta file
-matrix generate a matrix file
";
die "$usage" unless (@ARGV == 1);
my ($infile) = @ARGV;
#declare some variables
my @pop_names; #array with names of all pops
my @pops=(); #data structure with genotype information
#open and read in input file
open(INFILE, $infile)||die "can't open input file. $!\n";
my @input=<INFILE>;
close INFILE;
#check and see if any output format is specified
($fasta || $matrix)||die "no output format specified\n";
#create a hash of contig names and sizes from the header of the first vcf
print "gathering contig information\n";
#open vcf file
my @info=split(" ",$input[0]);
open(VCF, $info[1])||die "can't open vcf file. $!\n";
#ignore header lines until contig information
my $line;
do {$line=<VCF>;} until ($line =~ m/^##contig=<ID=.*/);
#while contig information, record lengths
my %contigs;
while ($line =~ m/^##contig=<ID=.*/)
{
my @contig_info=split("=",$line);
my @contig_id=split(",",$contig_info[2]);
my @contig_size=split(">",$contig_info[3]);
#add to hash
$contigs{$contig_id[0]} = $contig_size[0];
$line = <VCF>;
}
close VCF;
#if an input contig is specified, remove other contigs from the contig hash
if ($contig)
{
my $size=$contigs{$contig};
($size)||die "specified contig does not exist in first vcf header\n";
for (keys %contigs) { delete $contigs{$_}; }
$contigs{$contig}=$size;
}
#get list of contigs
my @contig_list=keys(%contigs);
#get genos and generate output for each contig in turn (to reduce memory usage)
while (my ($contig, $size)=each(%contigs))
{
print "processing contig=$contig ($size bp)\n";
#process each vcf
for (my $i=0; $i<@input; $i++)
{
#get race and file name from input
my @info=split(" ",$input[$i]);
#call subroutine to process the vcf file
print "processing $info[1]\n";
my $indivs_p=FormatGenos::vcf2geno($info[1],$contig);
#create the population (if it is new) and push into @pops
print "creating population\n";
my $pop_flag=@pop_names; #set flag for new population
#check if new population, if old population--reset pop flag
for (my $j=0; $j<@pop_names; $j++)
{
if ($info[0] eq $pop_names[$j]) {$pop_flag=$j}
}
#if new population, add population object and add name to pop_names array
if ($pop_flag==@pop_names)
{
push(@pops, Bio::PopGen::Population->new(-name => $info[0]));
push(@pop_names, $info[0]);
}
#add each individual to the population (get number of individuals processed from size of array pointed to, pop from pop_flag)
print "adding individuals to the population\n";
for (my $k=0; $k<@$indivs_p; $k++)
{
#add individual
$pops[$pop_flag]->add_Individual($$indivs_p[$k]);
}
}
#get a list of individuals
my @individuals=();
#for each population
for (my $i=0; $i<@pop_names; $i++)
{
my @inds=$pops[$i]->get_Individuals();
#for each individual
my $count=$pops[$i]->get_number_individuals;
for (my $j=0; $j<$count; $j++)
{
my $ind=$inds[$j]->unique_id;
push (@individuals, $ind);
}
}
#generate fasta files
if ($fasta)
{
print "generating fasta filess\n";
#call module to generate the fasta file for the contig
FormatGenos::createFasta($contig,$size,\@pop_names,\@pops);
}
#generate matrix files
if ($matrix)
{
print "generating matrix files\n";
#call module to generate the matrix file for the contig
FormatGenos::createMatrix($contig,$size,\@pop_names,\@pops);
}
#clear data from the contig
@pops=(); @pop_names=();
}
print "DONE!!!\n";
exit;