From 71f72426ee95cb8267ee3729bd8ea886936fa801 Mon Sep 17 00:00:00 2001
From: Valentina Boeva <valentina.boeva@gmail.com>
Date: Tue, 2 Aug 2016 15:25:29 +0200
Subject: [PATCH] add scripts

---
 scripts/assess_significance.R  |  60 +++++++++++++
 scripts/freec2bed.pl           | 139 +++++++++++++++++++++++++++++
 scripts/freec2circos.pl        | 156 +++++++++++++++++++++++++++++++++
 scripts/get_fasta_lengths.pl   |  44 ++++++++++
 scripts/makeGraph.R            |  97 ++++++++++++++++++++
 scripts/makeGraph_Chromosome.R |  59 +++++++++++++
 scripts/vcf2snpFreec.pl        |  95 ++++++++++++++++++++
 7 files changed, 650 insertions(+)
 create mode 100644 scripts/assess_significance.R
 create mode 100644 scripts/freec2bed.pl
 create mode 100644 scripts/freec2circos.pl
 create mode 100644 scripts/get_fasta_lengths.pl
 create mode 100644 scripts/makeGraph.R
 create mode 100644 scripts/makeGraph_Chromosome.R
 create mode 100644 scripts/vcf2snpFreec.pl

diff --git a/scripts/assess_significance.R b/scripts/assess_significance.R
new file mode 100644
index 0000000..f466024
--- /dev/null
+++ b/scripts/assess_significance.R
@@ -0,0 +1,60 @@
+library(rtracklayer)
+
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+ratio<-data.frame(dataTable)
+
+dataTable <- read.table(args[4], header=FALSE)
+cnvs<- data.frame(dataTable)
+
+ratio$Ratio[which(ratio$Ratio==-1)]=NA
+
+cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))  
+ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
+
+overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
+normals <- setdiff(ratio.bed,cnvs.bed)
+normals <- subsetByOverlaps(ratio.bed,normals)
+
+#mu <- mean(score(normals),na.rm=TRUE)
+#sigma<- sd(score(normals),na.rm=TRUE)
+
+#hist(score(normals),n=500,xlim=c(0,2))
+#hist(log(score(normals)),n=500,xlim=c(-1,1))
+
+#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
+#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
+
+#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
+#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
+#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
+
+numberOfCol=length(cnvs)
+
+for (i in c(1:length(cnvs[,1]))) {
+  values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
+  #wilcox.test(values,mu=mu)
+  W <- function(values,normals){resultw <- try(wilcox.test(values,score(normals)), silent = TRUE)
+	if(class(resultw)=="try-error") return(list("statistic"=NA,"parameter"=NA,"p.value"=NA,"null.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultw}
+  KS <- function(values,normals){resultks <- try(ks.test(values,score(normals)), silent = TRUE)
+	if(class(resultks)=="try-error") return(list("statistic"=NA,"p.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultks}
+  #resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE)
+  #	if(class(resultks)=="try-error") NA) else resultks
+  cnvs[i,numberOfCol+1]=W(values,normals)$p.value
+  cnvs[i,numberOfCol+2]=KS(values,normals)$p.value
+  }
+
+if (numberOfCol==5) {
+  names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+if (numberOfCol==7) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+if (numberOfCol==9) {
+  names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")  
+}
+write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F)
+
+
diff --git a/scripts/freec2bed.pl b/scripts/freec2bed.pl
new file mode 100644
index 0000000..431929f
--- /dev/null
+++ b/scripts/freec2bed.pl
@@ -0,0 +1,139 @@
+#!/usr/bin/perl -w
+#translate *_ratio.txt (the out put of FREEC) into a BED track
+
+use strict;
+
+my $usage = qq{
+    $0
+
+    -----------------------------
+    mandatory parameters:
+    
+    -f	 file            	file with ratio
+    -p   ploidy                 ploidy (default 2)
+    
+    -----------------------------
+    optional parameters:
+    
+    -v 				verbose mode
+    
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## arguments
+
+my $filename = "";
+my $verbose = 0;
+my $ploidy = 2;
+
+## parse command line arguments
+
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;
+    if ( $this_arg eq '-h') {print "$usage\n"; exit; }
+
+    elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
+    elsif ( $this_arg eq '-p') {$ploidy = shift @ARGV;}   
+
+    elsif ( $this_arg eq '-v') {$verbose = 1;}
+
+    elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
+}
+
+if( $filename eq ""){
+    die "you should specify file with ratio\n";
+}
+   
+#-----------read file with pairs-----
+my $numberOfAllSites = 0;
+my $totalCount=0;
+my $lines = 0;
+
+
+#open (OUT , ">$name") or die "Cannot open file $name!!!!: $!";
+
+if ($filename eq "") {
+	print "Please specify a file with ratio!\n" if ($verbose) ;
+        exit();
+}
+
+  
+if ($filename =~ /.gz$/) {
+        open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n";
+} else {
+        open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!";
+}
+
+#Chromosome	Start	Ratio	MedianRatio	CopyNumber
+#1	1	-1	1.03966	2
+#1	1001	-1	1.03966	2
+
+# to
+
+#chromA  chromStartA  chromEndA  dataValueA
+
+
+
+my ($chr,$start,$ratio,$MedRatio,$CPN) = ("","","","","");
+
+my $eventStart = "";
+my $eventEnd = "";
+my $eventValue = "";
+my $eventChr = "";
+my $count = 0;
+
+while (<FILE>) {		
+    chomp;
+    ($chr,$start,$ratio,$MedRatio,$CPN) = split /\t/;      
+    next if ($chr =~ m/^Chromosome/);        
+    $totalCount++;
+    $count++;
+    if ($eventStart eq "") {
+        $eventStart = $start;
+        $eventEnd = $start;
+        $eventValue = $MedRatio;
+        $eventChr =$chr;
+	$count =0;
+
+    } else {
+        if ($eventChr ne $chr) {
+            #new Chromosome, print the last event
+            
+            print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0);
+
+            $eventStart = $start;
+            $eventEnd = $start;
+            $eventValue = $MedRatio;
+            $eventChr =$chr;
+	    $count =0;
+
+        } else {
+             if ($eventValue ne $MedRatio) {
+                #new event, print the last event
+                print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0);
+
+                $eventStart = $start;
+                $eventEnd = $start;
+                $eventValue = $MedRatio;
+                $eventChr =$chr;
+		$count =0;
+
+             } else {
+                #move forward
+                $eventEnd = $start;                    
+             }                         
+        }	
+    }        
+}
+close FILE;
+
+#print the last event
+print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0);
+
+print STDERR "Read: $filename\t$totalCount\n";
+
+
diff --git a/scripts/freec2circos.pl b/scripts/freec2circos.pl
new file mode 100644
index 0000000..6945256
--- /dev/null
+++ b/scripts/freec2circos.pl
@@ -0,0 +1,156 @@
+#!/usr/bin/perl -w
+#translate "*_ratio.txt" (output of FREEC) into a Circos track
+
+use strict;
+
+my $usage = qq{
+    $0
+
+    -----------------------------
+    mandatory parameters:
+    
+    -f	 file            	file with ratio
+    -p   ploidy                 ploidy (default 2)
+    
+    -----------------------------
+    optional parameters:
+    
+    -v 				verbose mode
+    
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+##  arguments
+
+my $filename = "";
+my $verbose = 0;
+my $ploidy = 2;
+
+## parse command line arguments
+
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;
+    if ( $this_arg eq '-h') {print "$usage\n"; exit; }
+
+    elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
+    elsif ( $this_arg eq '-p') {$ploidy = shift @ARGV;}   
+
+    elsif ( $this_arg eq '-v') {$verbose = 1;}
+
+    elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
+}
+
+if( $filename eq ""){
+    die "you should specify file with ratio\n";
+}
+   
+ 
+
+#-----------read file with pairs-----
+my $numberOfAllSites = 0;
+my $totalCount=0;
+my $lines = 0;
+
+
+#open (OUT , ">$name") or die "Cannot open file $name!!!!: $!";
+
+if ($filename eq "") {
+	print "Please specify a file with ratio!\n" if ($verbose) ;
+        exit();
+}
+
+  
+if ($filename =~ /.gz$/) {
+        open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n";
+} else {
+        open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!";
+}
+
+#Chromosome	Start	Ratio	MedianRatio	CopyNumber
+#1	1	-1	1.03966	2
+#1	1001	-1	1.03966	2
+
+# to
+
+#hsY 0 999999 0.005786
+#hsY 1000000 1999999 0.003958
+#hsY 2000000 2999999 0.002578
+#hsY 3000000 3999999 0.007451
+
+
+my ($chr,$start,$ratio,$MedRatio,$CPN) = ("","","","","");
+
+my $eventStart = "";
+my $eventEnd = "";
+my $eventValue = "";
+my $eventChr = "";
+my $count = 0;
+my $eventRatio = "";
+
+while (<FILE>) {		
+    chomp;
+    ($chr,$start,$ratio,$MedRatio,$CPN) = split /\t/;      
+    next if ($chr =~ m/^Chromosome/);        
+    $totalCount++;
+    $count++;
+    if ($eventStart eq "") {
+        $eventStart = $start;
+        $eventEnd = $start+999;
+        $eventValue = $MedRatio;
+	$eventRatio = $ratio;
+        $eventChr =$chr;
+	$count =0;
+	print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0);
+
+    } else {
+        if ($eventChr ne $chr) {
+            #new Chromosome, print the last event
+            
+            print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0);
+
+            $eventStart = $start;
+            $eventEnd = $start+999;
+            $eventValue = $MedRatio;
+	    $eventRatio = $ratio;
+            $eventChr =$chr;
+	    $count =0;
+	    print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0);
+
+        } else {
+             if ($eventValue ne $MedRatio) {
+                #new event, print the last event
+                print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0);
+
+                $eventStart = $start;
+                $eventEnd = $start+999;
+                $eventValue = $MedRatio;
+		$eventRatio = $ratio;
+                $eventChr =$chr;
+		$count =0;
+		print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0);
+
+             } else {
+                #move forward
+                $eventEnd = $start+999;;                    
+             }                         
+        }
+	if ($count>=10) {
+	    $count =0;
+	    print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($ratio >= 0); #($eventValue >= 0); $eventRatio >0
+	    $eventStart=$eventEnd;
+
+	}
+    }        
+}
+close FILE;
+
+#print the last event
+print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0);
+
+print STDERR "Read: $filename\t$totalCount\n";
+
+
diff --git a/scripts/get_fasta_lengths.pl b/scripts/get_fasta_lengths.pl
new file mode 100644
index 0000000..97c889a
--- /dev/null
+++ b/scripts/get_fasta_lengths.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl -w
+#prints lengths of sequences from a multifasta file
+
+use strict;
+
+my $Filename;
+
+if ($ARGV[0]) {
+	$Filename = $ARGV[0];
+}
+else {
+	print "Not enough arguments\n\n";
+	die "!\n\n";
+}
+
+my $result_file=$Filename;
+$result_file =~ s/.*\\//;
+$result_file =~ s/.*\///;
+$result_file="res_".$result_file;
+open (FILE, "<$Filename") or die "Cannot open file!!!!: $!";
+open (OUT, ">$result_file") or die "Cannot open file!!!!: $!";
+
+$_ = <FILE>;
+if (/>(.*)/) {		
+	 print OUT $1,"\t";
+}
+my $length = 0;
+
+while (<FILE>)  {
+	chomp;
+	if (/>(.*)/) {	
+		print OUT $length,"\n";
+		$length = 0;
+		 print OUT $1,"\t";
+	}
+	else {
+		$length += length($_);
+	}
+}
+
+print OUT $length,"\n";
+
+close FILE;
+close OUT;
diff --git a/scripts/makeGraph.R b/scripts/makeGraph.R
new file mode 100644
index 0000000..7a69d47
--- /dev/null
+++ b/scripts/makeGraph.R
@@ -0,0 +1,97 @@
+args <- commandArgs()
+
+dataTable <-read.table(args[5], header=TRUE);
+
+ratio<-data.frame(dataTable)
+ploidy <- type.convert(args[4])
+
+png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(5,5))
+
+maxLevelToPlot <- 3
+for (i in c(1:length(ratio$Ratio))) {
+	if (ratio$Ratio[i]>maxLevelToPlot) {
+		ratio$Ratio[i]=maxLevelToPlot;
+	}
+}
+
+
+for (i in c(1:22,'X','Y')) {
+	tt <- which(ratio$Chromosome==i)
+	if (length(tt)>0) {
+	 plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88])
+	 tt <- which(ratio$Chromosome==i  & ratio$CopyNumber>ploidy )
+	 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
+	
+	tt <- which(ratio$Chromosome==i  & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)	
+	points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
+	 
+	tt <- which(ratio$Chromosome==i  & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
+	 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
+	 tt <- which(ratio$Chromosome==i)
+	 
+	 #UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
+	 #points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
+	 
+	}
+	tt <- which(ratio$Chromosome==i)
+	
+	#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
+	#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
+	
+}
+
+dev.off()
+
+if (length(args)>5) {
+	dataTable <-read.table(args[6], header=TRUE);
+	BAF<-data.frame(dataTable)
+
+	png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
+	    units = "px", pointsize = 20, bg = "white", res = NA)
+	plot(1:10)
+	op <- par(mfrow = c(5,5))
+
+	for (i in c(1:22,'X','Y')) {
+	    tt <- which(BAF$Chromosome==i)
+	    if (length(tt)>0){
+		lBAF <-BAF[tt,]
+		plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])
+
+		tt <- which(lBAF$A==0.5)		
+		points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
+		tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
+		points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
+		tt <- 1
+		pres <- 1
+
+		if (length(lBAF$A)>4) {
+			for (j in c(2:(length(lBAF$A)-pres-1))) {
+				if (lBAF$A[j]==lBAF$A[j+pres]) {	
+					tt[length(tt)+1] <- j 
+				}
+			}
+			points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
+			points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)	
+		}
+
+		tt <- 1
+		pres <- 1
+		if (length(lBAF$FittedA)>4) {
+			for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
+				if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {	
+					tt[length(tt)+1] <- j 
+				}
+			}
+			points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
+			points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)	
+		}
+
+	   }
+
+	}
+	dev.off()
+
+}
diff --git a/scripts/makeGraph_Chromosome.R b/scripts/makeGraph_Chromosome.R
new file mode 100644
index 0000000..1c45aed
--- /dev/null
+++ b/scripts/makeGraph_Chromosome.R
@@ -0,0 +1,59 @@
+args <- commandArgs()
+
+dataTable <-read.table(args[6], header=TRUE);
+print( paste (args[6],"read"))
+ratio<-data.frame(dataTable)
+chr <- type.convert(args[4])
+#chr <- 'X'
+ploidy <- type.convert(args[5])
+
+maxLevelToPlot <- 3
+for (i in c(1:length(ratio$Ratio))) {
+	if (ratio$Ratio[i]>maxLevelToPlot) {
+		ratio$Ratio[i]=maxLevelToPlot;
+	}
+}
+
+png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
+    units = "px", pointsize = 20, bg = "white", res = NA)
+plot(1:10)
+op <- par(mfrow = c(2,1))
+i <- chr
+	tt <- which(ratio$Chromosome == i)
+	if (length(tt)>0) {
+		 plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88],cex=2)
+		 tt <- which(ratio$Chromosome==i  & ratio$CopyNumber>ploidy )
+		 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=2)
+		 tt <- which(ratio$Chromosome==i  & ratio$Ratio==maxLevelToPlot)
+		 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
+		 tt <- which(ratio$Chromosome==i  & ratio$CopyNumber<ploidy )
+		 points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461],cex=2)
+		 tt <- which(ratio$Chromosome==i)
+		 points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=2)
+		 points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[98],cex=4)
+	}
+if (length(args)>=7) {
+	dataTable <-read.table(args[7], header=TRUE);
+	BAF<-data.frame(dataTable)
+	tt <- which(BAF$Chromosome==i)
+	lBAF <-BAF[tt,]
+	plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])
+	tt <- which(lBAF$A==0.5)		
+	points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
+	tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
+	points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[450])
+	tt <- which(lBAF$B==1)
+	points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
+	tt <- 1
+	pres <- 1
+	for (j in c(2:(length(lBAF$A)-pres-1))) {
+			if (lBAF$A[j]==lBAF$A[j+pres]) {	
+				tt[length(tt)+1] <- j 
+			}
+	}
+	points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
+	points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
+	print( paste (args[7],"read"))
+}
+dev.off()
+
diff --git a/scripts/vcf2snpFreec.pl b/scripts/vcf2snpFreec.pl
new file mode 100644
index 0000000..14465c9
--- /dev/null
+++ b/scripts/vcf2snpFreec.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+#translate dbSNP vcf file into the format acceptable by FREEC (for dbSNP files)
+
+use strict;
+
+my $usage = qq{
+    $0
+    
+    -f	 file            	dbSNP vcf file
+    
+####
+
+    OUTPUT will be in the standard output!!! pipe it if you need with gzip!!!
+    
+};
+
+if(scalar(@ARGV) == 0){
+    print $usage;
+    exit(0);
+}
+
+## mandatory arguments
+
+my $filename = "";
+
+
+
+## parse command line arguments
+
+while(scalar(@ARGV) > 0){
+    my $this_arg = shift @ARGV;
+    if ( $this_arg eq '-h') {print "$usage\n"; exit; }
+
+    elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
+    
+
+    elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
+}
+
+if( $filename eq ""){
+    die "you should specify VCF file from dbSNP\n";
+}
+   
+ 
+
+#-----------read file with pairs-----
+
+if ($filename =~ /.gz$/) {
+        open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n";
+} else {
+        open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!";
+}
+
+##INFO=<ID=MUT,Number=0,Type=Flag,Description="Is mutation (journal citation, explicit fact): a low frequency variation that is cited in journal and other reputable sources">
+#1	10389	rs373144384	AC	A	.	.	RS=373144384;RSPOS=10390;dbSNPBuildID=138;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP
+#1	10433	rs56289060	A	AC	.	.	RS=56289060;RSPOS=10433;dbSNPBuildID=129;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP
+#1	10440	rs112155239	C	A	.	.	RS=112155239;RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=0x050000020005000002000100;WGT=1;VC=SNV;R5;ASP
+
+# to
+
+#chr1	11017	G/T	+	G	rs78927064
+#chr1	11019	G/T	+	T	rs79282076
+#chr1	11022	A/G	+	G	rs28775022
+#chr1	11022	A/G	+	G	rs62636509
+
+
+my ($chr,$start,$possibilities,$strand,$refAllele,$ID) = ("","","","+","","");
+
+my $numberOfSites = 0;
+my $totalCount=0;
+my $lines = 0;
+
+my($dot,$dot2,$info,$AltAllele);
+
+while (<FILE>) {	
+    $lines++;
+    next if (m/^\#\#/);
+    chomp;
+    ($chr,$start,$ID,$refAllele,$AltAllele,$dot,$dot2,$info) = split /\t/;      
+    $totalCount++;    
+    next unless (length($refAllele)==1 && length($AltAllele)==1);
+    $numberOfSites++;
+    unless($chr =~ m/^chr/) {
+	$chr="chr".$chr;
+    }
+    
+    $possibilities = $refAllele."/".$AltAllele;
+    print "$chr\t$start\t$possibilities\t$strand\t$refAllele\t$ID\n";
+}
+close FILE;
+
+
+print STDERR "Read: $filename\tlines: $lines;\ttotal sites: $totalCount\taccepted sites: $numberOfSites\n";
+
+