From 71f72426ee95cb8267ee3729bd8ea886936fa801 Mon Sep 17 00:00:00 2001 From: Valentina Boeva Date: Tue, 2 Aug 2016 15:25:29 +0200 Subject: [PATCH] add scripts --- scripts/assess_significance.R | 60 +++++++++++++ scripts/freec2bed.pl | 139 +++++++++++++++++++++++++++++ scripts/freec2circos.pl | 156 +++++++++++++++++++++++++++++++++ scripts/get_fasta_lengths.pl | 44 ++++++++++ scripts/makeGraph.R | 97 ++++++++++++++++++++ scripts/makeGraph_Chromosome.R | 59 +++++++++++++ scripts/vcf2snpFreec.pl | 95 ++++++++++++++++++++ 7 files changed, 650 insertions(+) create mode 100644 scripts/assess_significance.R create mode 100644 scripts/freec2bed.pl create mode 100644 scripts/freec2circos.pl create mode 100644 scripts/get_fasta_lengths.pl create mode 100644 scripts/makeGraph.R create mode 100644 scripts/makeGraph_Chromosome.R create mode 100644 scripts/vcf2snpFreec.pl diff --git a/scripts/assess_significance.R b/scripts/assess_significance.R new file mode 100644 index 0000000..f466024 --- /dev/null +++ b/scripts/assess_significance.R @@ -0,0 +1,60 @@ +library(rtracklayer) + +args <- commandArgs() + +dataTable <-read.table(args[5], header=TRUE); +ratio<-data.frame(dataTable) + +dataTable <- read.table(args[4], header=FALSE) +cnvs<- data.frame(dataTable) + +ratio$Ratio[which(ratio$Ratio==-1)]=NA + +cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3])) +ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio) + +overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed) +normals <- setdiff(ratio.bed,cnvs.bed) +normals <- subsetByOverlaps(ratio.bed,normals) + +#mu <- mean(score(normals),na.rm=TRUE) +#sigma<- sd(score(normals),na.rm=TRUE) + +#hist(score(normals),n=500,xlim=c(0,2)) +#hist(log(score(normals)),n=500,xlim=c(-1,1)) + +#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000]) +#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10))) +#qqline(score(normals)[which(!is.na(score(normals)))], col = 2) + +#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000]) +#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10))) +#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2) + +numberOfCol=length(cnvs) + +for (i in c(1:length(cnvs[,1]))) { + values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i])) + #wilcox.test(values,mu=mu) + W <- function(values,normals){resultw <- try(wilcox.test(values,score(normals)), silent = TRUE) + if(class(resultw)=="try-error") return(list("statistic"=NA,"parameter"=NA,"p.value"=NA,"null.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultw} + KS <- function(values,normals){resultks <- try(ks.test(values,score(normals)), silent = TRUE) + if(class(resultks)=="try-error") return(list("statistic"=NA,"p.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultks} + #resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE) + # if(class(resultks)=="try-error") NA) else resultks + cnvs[i,numberOfCol+1]=W(values,normals)$p.value + cnvs[i,numberOfCol+2]=KS(values,normals)$p.value + } + +if (numberOfCol==5) { + names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue") +} +if (numberOfCol==7) { + names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue") +} +if (numberOfCol==9) { + names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue") +} +write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F) + + diff --git a/scripts/freec2bed.pl b/scripts/freec2bed.pl new file mode 100644 index 0000000..431929f --- /dev/null +++ b/scripts/freec2bed.pl @@ -0,0 +1,139 @@ +#!/usr/bin/perl -w +#translate *_ratio.txt (the out put of FREEC) into a BED track + +use strict; + +my $usage = qq{ + $0 + + ----------------------------- + mandatory parameters: + + -f file file with ratio + -p ploidy ploidy (default 2) + + ----------------------------- + optional parameters: + + -v verbose mode + +}; + +if(scalar(@ARGV) == 0){ + print $usage; + exit(0); +} + +## arguments + +my $filename = ""; +my $verbose = 0; +my $ploidy = 2; + +## parse command line arguments + +while(scalar(@ARGV) > 0){ + my $this_arg = shift @ARGV; + if ( $this_arg eq '-h') {print "$usage\n"; exit; } + + elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} + elsif ( $this_arg eq '-p') {$ploidy = shift @ARGV;} + + elsif ( $this_arg eq '-v') {$verbose = 1;} + + elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} +} + +if( $filename eq ""){ + die "you should specify file with ratio\n"; +} + +#-----------read file with pairs----- +my $numberOfAllSites = 0; +my $totalCount=0; +my $lines = 0; + + +#open (OUT , ">$name") or die "Cannot open file $name!!!!: $!"; + +if ($filename eq "") { + print "Please specify a file with ratio!\n" if ($verbose) ; + exit(); +} + + +if ($filename =~ /.gz$/) { + open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n"; +} else { + open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!"; +} + +#Chromosome Start Ratio MedianRatio CopyNumber +#1 1 -1 1.03966 2 +#1 1001 -1 1.03966 2 + +# to + +#chromA chromStartA chromEndA dataValueA + + + +my ($chr,$start,$ratio,$MedRatio,$CPN) = ("","","","",""); + +my $eventStart = ""; +my $eventEnd = ""; +my $eventValue = ""; +my $eventChr = ""; +my $count = 0; + +while () { + chomp; + ($chr,$start,$ratio,$MedRatio,$CPN) = split /\t/; + next if ($chr =~ m/^Chromosome/); + $totalCount++; + $count++; + if ($eventStart eq "") { + $eventStart = $start; + $eventEnd = $start; + $eventValue = $MedRatio; + $eventChr =$chr; + $count =0; + + } else { + if ($eventChr ne $chr) { + #new Chromosome, print the last event + + print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0); + + $eventStart = $start; + $eventEnd = $start; + $eventValue = $MedRatio; + $eventChr =$chr; + $count =0; + + } else { + if ($eventValue ne $MedRatio) { + #new event, print the last event + print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0); + + $eventStart = $start; + $eventEnd = $start; + $eventValue = $MedRatio; + $eventChr =$chr; + $count =0; + + } else { + #move forward + $eventEnd = $start; + } + } + } +} +close FILE; + +#print the last event +print "chr$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0); + +print STDERR "Read: $filename\t$totalCount\n"; + + diff --git a/scripts/freec2circos.pl b/scripts/freec2circos.pl new file mode 100644 index 0000000..6945256 --- /dev/null +++ b/scripts/freec2circos.pl @@ -0,0 +1,156 @@ +#!/usr/bin/perl -w +#translate "*_ratio.txt" (output of FREEC) into a Circos track + +use strict; + +my $usage = qq{ + $0 + + ----------------------------- + mandatory parameters: + + -f file file with ratio + -p ploidy ploidy (default 2) + + ----------------------------- + optional parameters: + + -v verbose mode + +}; + +if(scalar(@ARGV) == 0){ + print $usage; + exit(0); +} + +## arguments + +my $filename = ""; +my $verbose = 0; +my $ploidy = 2; + +## parse command line arguments + +while(scalar(@ARGV) > 0){ + my $this_arg = shift @ARGV; + if ( $this_arg eq '-h') {print "$usage\n"; exit; } + + elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} + elsif ( $this_arg eq '-p') {$ploidy = shift @ARGV;} + + elsif ( $this_arg eq '-v') {$verbose = 1;} + + elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} +} + +if( $filename eq ""){ + die "you should specify file with ratio\n"; +} + + + +#-----------read file with pairs----- +my $numberOfAllSites = 0; +my $totalCount=0; +my $lines = 0; + + +#open (OUT , ">$name") or die "Cannot open file $name!!!!: $!"; + +if ($filename eq "") { + print "Please specify a file with ratio!\n" if ($verbose) ; + exit(); +} + + +if ($filename =~ /.gz$/) { + open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n"; +} else { + open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!"; +} + +#Chromosome Start Ratio MedianRatio CopyNumber +#1 1 -1 1.03966 2 +#1 1001 -1 1.03966 2 + +# to + +#hsY 0 999999 0.005786 +#hsY 1000000 1999999 0.003958 +#hsY 2000000 2999999 0.002578 +#hsY 3000000 3999999 0.007451 + + +my ($chr,$start,$ratio,$MedRatio,$CPN) = ("","","","",""); + +my $eventStart = ""; +my $eventEnd = ""; +my $eventValue = ""; +my $eventChr = ""; +my $count = 0; +my $eventRatio = ""; + +while () { + chomp; + ($chr,$start,$ratio,$MedRatio,$CPN) = split /\t/; + next if ($chr =~ m/^Chromosome/); + $totalCount++; + $count++; + if ($eventStart eq "") { + $eventStart = $start; + $eventEnd = $start+999; + $eventValue = $MedRatio; + $eventRatio = $ratio; + $eventChr =$chr; + $count =0; + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0); + + } else { + if ($eventChr ne $chr) { + #new Chromosome, print the last event + + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0); + + $eventStart = $start; + $eventEnd = $start+999; + $eventValue = $MedRatio; + $eventRatio = $ratio; + $eventChr =$chr; + $count =0; + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0); + + } else { + if ($eventValue ne $MedRatio) { + #new event, print the last event + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventValue >= 0); + + $eventStart = $start; + $eventEnd = $start+999; + $eventValue = $MedRatio; + $eventRatio = $ratio; + $eventChr =$chr; + $count =0; + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0); + + } else { + #move forward + $eventEnd = $start+999;; + } + } + if ($count>=10) { + $count =0; + print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($ratio >= 0); #($eventValue >= 0); $eventRatio >0 + $eventStart=$eventEnd; + + } + } +} +close FILE; + +#print the last event +print "hs$eventChr $eventStart $eventEnd ".$eventValue*$ploidy."\n" if ($eventRatio >= 0); + +print STDERR "Read: $filename\t$totalCount\n"; + + diff --git a/scripts/get_fasta_lengths.pl b/scripts/get_fasta_lengths.pl new file mode 100644 index 0000000..97c889a --- /dev/null +++ b/scripts/get_fasta_lengths.pl @@ -0,0 +1,44 @@ +#!/usr/bin/perl -w +#prints lengths of sequences from a multifasta file + +use strict; + +my $Filename; + +if ($ARGV[0]) { + $Filename = $ARGV[0]; +} +else { + print "Not enough arguments\n\n"; + die "!\n\n"; +} + +my $result_file=$Filename; +$result_file =~ s/.*\\//; +$result_file =~ s/.*\///; +$result_file="res_".$result_file; +open (FILE, "<$Filename") or die "Cannot open file!!!!: $!"; +open (OUT, ">$result_file") or die "Cannot open file!!!!: $!"; + +$_ = ; +if (/>(.*)/) { + print OUT $1,"\t"; +} +my $length = 0; + +while () { + chomp; + if (/>(.*)/) { + print OUT $length,"\n"; + $length = 0; + print OUT $1,"\t"; + } + else { + $length += length($_); + } +} + +print OUT $length,"\n"; + +close FILE; +close OUT; diff --git a/scripts/makeGraph.R b/scripts/makeGraph.R new file mode 100644 index 0000000..7a69d47 --- /dev/null +++ b/scripts/makeGraph.R @@ -0,0 +1,97 @@ +args <- commandArgs() + +dataTable <-read.table(args[5], header=TRUE); + +ratio<-data.frame(dataTable) +ploidy <- type.convert(args[4]) + +png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180, + units = "px", pointsize = 20, bg = "white", res = NA) +plot(1:10) +op <- par(mfrow = c(5,5)) + +maxLevelToPlot <- 3 +for (i in c(1:length(ratio$Ratio))) { + if (ratio$Ratio[i]>maxLevelToPlot) { + ratio$Ratio[i]=maxLevelToPlot; + } +} + + +for (i in c(1:22,'X','Y')) { + tt <- which(ratio$Chromosome==i) + if (length(tt)>0) { + plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88]) + tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy ) + points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136]) + + tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy) + points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4) + + tt <- which(ratio$Chromosome==i & ratio$CopyNumber5) { + dataTable <-read.table(args[6], header=TRUE); + BAF<-data.frame(dataTable) + + png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180, + units = "px", pointsize = 20, bg = "white", res = NA) + plot(1:10) + op <- par(mfrow = c(5,5)) + + for (i in c(1:22,'X','Y')) { + tt <- which(BAF$Chromosome==i) + if (length(tt)>0){ + lBAF <-BAF[tt,] + plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1]) + + tt <- which(lBAF$A==0.5) + points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92]) + tt <- which(lBAF$A!=0.5 & lBAF$A>=0) + points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62]) + tt <- 1 + pres <- 1 + + if (length(lBAF$A)>4) { + for (j in c(2:(length(lBAF$A)-pres-1))) { + if (lBAF$A[j]==lBAF$A[j+pres]) { + tt[length(tt)+1] <- j + } + } + points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4) + points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4) + } + + tt <- 1 + pres <- 1 + if (length(lBAF$FittedA)>4) { + for (j in c(2:(length(lBAF$FittedA)-pres-1))) { + if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) { + tt[length(tt)+1] <- j + } + } + points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4) + points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4) + } + + } + + } + dev.off() + +} diff --git a/scripts/makeGraph_Chromosome.R b/scripts/makeGraph_Chromosome.R new file mode 100644 index 0000000..1c45aed --- /dev/null +++ b/scripts/makeGraph_Chromosome.R @@ -0,0 +1,59 @@ +args <- commandArgs() + +dataTable <-read.table(args[6], header=TRUE); +print( paste (args[6],"read")) +ratio<-data.frame(dataTable) +chr <- type.convert(args[4]) +#chr <- 'X' +ploidy <- type.convert(args[5]) + +maxLevelToPlot <- 3 +for (i in c(1:length(ratio$Ratio))) { + if (ratio$Ratio[i]>maxLevelToPlot) { + ratio$Ratio[i]=maxLevelToPlot; + } +} + +png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180, + units = "px", pointsize = 20, bg = "white", res = NA) +plot(1:10) +op <- par(mfrow = c(2,1)) +i <- chr + tt <- which(ratio$Chromosome == i) + if (length(tt)>0) { + plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88],cex=2) + tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy ) + points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=2) + tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot) + points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4) + tt <- which(ratio$Chromosome==i & ratio$CopyNumber=7) { + dataTable <-read.table(args[7], header=TRUE); + BAF<-data.frame(dataTable) + tt <- which(BAF$Chromosome==i) + lBAF <-BAF[tt,] + plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1]) + tt <- which(lBAF$A==0.5) + points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92]) + tt <- which(lBAF$A!=0.5 & lBAF$A>=0) + points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[450]) + tt <- which(lBAF$B==1) + points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62]) + tt <- 1 + pres <- 1 + for (j in c(2:(length(lBAF$A)-pres-1))) { + if (lBAF$A[j]==lBAF$A[j+pres]) { + tt[length(tt)+1] <- j + } + } + points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4) + points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4) + print( paste (args[7],"read")) +} +dev.off() + diff --git a/scripts/vcf2snpFreec.pl b/scripts/vcf2snpFreec.pl new file mode 100644 index 0000000..14465c9 --- /dev/null +++ b/scripts/vcf2snpFreec.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl -w +#translate dbSNP vcf file into the format acceptable by FREEC (for dbSNP files) + +use strict; + +my $usage = qq{ + $0 + + -f file dbSNP vcf file + +#### + + OUTPUT will be in the standard output!!! pipe it if you need with gzip!!! + +}; + +if(scalar(@ARGV) == 0){ + print $usage; + exit(0); +} + +## mandatory arguments + +my $filename = ""; + + + +## parse command line arguments + +while(scalar(@ARGV) > 0){ + my $this_arg = shift @ARGV; + if ( $this_arg eq '-h') {print "$usage\n"; exit; } + + elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} + + + elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} +} + +if( $filename eq ""){ + die "you should specify VCF file from dbSNP\n"; +} + + + +#-----------read file with pairs----- + +if ($filename =~ /.gz$/) { + open(FILE, "gunzip -c $filename |") or die "$0: can't open ".$filename.":$!\n"; +} else { + open (FILE, "<$filename") or die "Cannot open file $filename!!!!: $!"; +} + +##INFO= +#1 10389 rs373144384 AC A . . RS=373144384;RSPOS=10390;dbSNPBuildID=138;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP +#1 10433 rs56289060 A AC . . RS=56289060;RSPOS=10433;dbSNPBuildID=129;SSR=0;SAO=0;VP=0x050000020005000002000200;WGT=1;VC=DIV;R5;ASP +#1 10440 rs112155239 C A . . RS=112155239;RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=0x050000020005000002000100;WGT=1;VC=SNV;R5;ASP + +# to + +#chr1 11017 G/T + G rs78927064 +#chr1 11019 G/T + T rs79282076 +#chr1 11022 A/G + G rs28775022 +#chr1 11022 A/G + G rs62636509 + + +my ($chr,$start,$possibilities,$strand,$refAllele,$ID) = ("","","","+","",""); + +my $numberOfSites = 0; +my $totalCount=0; +my $lines = 0; + +my($dot,$dot2,$info,$AltAllele); + +while () { + $lines++; + next if (m/^\#\#/); + chomp; + ($chr,$start,$ID,$refAllele,$AltAllele,$dot,$dot2,$info) = split /\t/; + $totalCount++; + next unless (length($refAllele)==1 && length($AltAllele)==1); + $numberOfSites++; + unless($chr =~ m/^chr/) { + $chr="chr".$chr; + } + + $possibilities = $refAllele."/".$AltAllele; + print "$chr\t$start\t$possibilities\t$strand\t$refAllele\t$ID\n"; +} +close FILE; + + +print STDERR "Read: $filename\tlines: $lines;\ttotal sites: $totalCount\taccepted sites: $numberOfSites\n"; + +