Skip to content

Commit

Permalink
Merge pull request #16 from FredHutch/mpmeers-patch-191006
Browse files Browse the repository at this point in the history
Update from v1.1 to v1.2
  • Loading branch information
mpmeers authored Oct 30, 2019
2 parents 6baf05a + acf8116 commit 8ed4b16
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 25 deletions.
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,20 @@ A web interface for SEACR analysis can be found at https://seacr.fredhutch.org

## Recent changes

### v1.2

- Fixed a bug in lines 166 and 168 of SEACR_1.1.sh in which misplaced brackets caused the misreporting of the max signal region terminal coordinate for merged signal blocks
- Added a counter to keep track of the number of component bedgraph lines that compose each signal block, and a function to calculate the minimum threshold of lines per signal block at which there is a smaller percentage of target signal blocks remaining than control. This is meant to be used as a filter for signal blocks that pass the total signal threshold despite being composed of very few bedgraph lines; such blocks are unlikely to be true peaks.
- Changed how the dataframe for density plotting is truncated (previously a hard-coded 90% cutoff): a dataframe of list quantile (i.e. line # / max line #) vs. value quantile (i.e. value / max value) is derived, and the threshold is selected by finding the (list quantile, value quantile) pair for which the orthogonal distance below the line defined by (0,0) and (1,1) is maximized.

### v1.1
- Changed "union" and "AUC" modes to "relaxed" and "stringent" modes, respectively.
- Removed maximum signal threshold from "relaxed" mode and replaced it with an alternate total signal threshold that uses the point halfway between the knee and the peak of the total signal curve as described in the manuscript text. This change improves performance at high read depth.
- Implemented alternate threshold test that searches for any thresholds that come within 95% of the optimal threshold. This change avoids spurious thresholds that are overselective in some datasets.

## Usage:

bash SEACR_1.1.sh experimental bedgraph [control bedgraph | numeric threshold] ["norm" | "non"] ["relaxed" | "stringent"] output prefix
bash SEACR_1.2.sh experimental bedgraph [control bedgraph | numeric threshold] ["norm" | "non"] ["relaxed" | "stringent"] output prefix

## Description of input fields:

Expand All @@ -44,7 +50,7 @@ Here is some example code for converting from a paired-end BAM to a fragment bed

bedtools bamtobed -bedpe -i $sample.bam > $sample.bed
awk '$1==$4 && $6-$2 < 1000 {print $0}' $sample.bed > $sample.clean.bed
cut -f 1,2,6 $sample.clean.bed > $sample.fragments.bed
cut -f 1,2,6 $sample.clean.bed | sort -k1,1 -k2,2n -k3,3n > $sample.fragments.bed
bedtools genomecov -bg -i $sample.fragments.bed -g my.genome > $sample.fragments.bedgraph

## Output file:
Expand All @@ -70,11 +76,11 @@ Field 6: Region representing the farthest upstream and farthest downstream bases

## Examples:

bash SEACR_1.1.sh target.bedgraph IgG.bedgraph norm stringent output
bash SEACR_1.2.sh target.bedgraph IgG.bedgraph norm stringent output
Calls enriched regions in target data using normalized IgG control track with stringent threshold

bash SEACR_1.1.sh target.bedgraph IgG.bedgraph non relaxed output
bash SEACR_1.2.sh target.bedgraph IgG.bedgraph non relaxed output
Calls enriched regions in target data using non-normalized IgG control track with relaxed threshold

bash SEACR_1.1.sh target.bedgraph 0.01 non stringent output
bash SEACR_1.2.sh target.bedgraph 0.01 non stringent output
Calls enriched regions in target data by selecting the top 1% of regions by area under the curve (AUC)
28 changes: 25 additions & 3 deletions SEACR_1.1.R → SEACR_1.2.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,29 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold
# print("Ctrl is a file")
ctrl<-read.table(argsL$ctrl)
ctrlvec<-ctrl$V1
ctrlmax<-ctrl$V2
rm(ctrl)
invis <- gc(verbose=FALSE)
if(argsL$norm=="yes"){ ## Calculate peaks of density plots to generate normalization factor
ctrlvalue<-sort(ctrlvec)[as.integer(0.9*length(ctrlvec))] ## Added 7/15/19 to improve memory performance
expvalue<-sort(expvec)[as.integer(0.9*length(expvec))] ## Added 7/15/19 to improve memory performance
dist2d<-function(a,b,c){v1<- b - c; v2<- a - b; m<-cbind(v1,v2); d<-det(m)/sqrt(sum(v1*v1))}
expframe<-data.frame(count=seq(1,0,length=length(expvec)), quant=sort(expvec,decreasing=TRUE)/max(expvec), value=sort(expvec,decreasing=TRUE))
expframe$diff<-abs(expframe$count-expframe$quant)
expframe<-expframe[expframe$diff > 0.9*max(expframe$diff),]
expframe$dist<-apply(expframe,1,function(x) dist2d(c(x[1],x[2]),0,1))
ctrlframe<-data.frame(count=seq(1,0,length=length(ctrlvec)), quant=sort(ctrlvec,decreasing=TRUE)/max(ctrlvec), value=sort(ctrlvec,decreasing=TRUE))
ctrlframe$diff<-abs(ctrlframe$count-ctrlframe$quant)
ctrlframe<-ctrlframe[ctrlframe$diff > 0.9*max(ctrlframe$diff),]
ctrlframe$dist<-apply(ctrlframe,1,function(x) dist2d(c(x[1],x[2]),0,1))
if(ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1] > sort(ctrlvec)[as.integer(0.9*length(ctrlvec))]){
ctrlvalue<-ctrlframe$value[ctrlframe$dist==max(ctrlframe$dist)][1]
}else{
ctrlvalue<-sort(ctrlvec)[as.integer(0.9*length(ctrlvec))] ## Added 7/15/19 to improve memory performance
}
if(expframe$value[expframe$dist==max(expframe$dist)][1] > sort(expvec)[as.integer(0.9*length(expvec))]){
expvalue<-expframe$value[expframe$dist==max(expframe$dist)][1]
}else{
expvalue<-sort(expvec)[as.integer(0.9*length(expvec))] ## Added 7/15/19 to improve memory performance
}
ctrltest<-density(ctrlvec[ctrlvec <= ctrlvalue]) ## New for SEACR_1.1
exptest<-density(expvec[expvec <= expvalue]) ## New for SEACR_1.1
constant<-(exptest$x[exptest$y==max(exptest$y)])/(ctrltest$x[ctrltest$y==max(ctrltest$y)])
Expand Down Expand Up @@ -104,6 +122,10 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold
x0<-a0
z0<-b0
}
# Pool the expmax and ctrlmax vectors into one set of candidate values.
# NOTE(review): the companion shell script now writes `cut -f 4,7`, so these look like
# per-signal-block bedgraph line counts rather than max signal -- confirm against the caller.
both2<-c(expmax,ctrlmax)
# Candidate thresholds: every distinct pooled value, in ascending order.
d<-sort(unique(both2))
# 1 - (experimental ECDF - control ECDF) evaluated at x; the result exceeds 1 exactly
# where the control ECDF is larger than the experimental ECDF.
pctremain2<-function(x) 1-(ecdf(expmax)(x)-ecdf(ctrlmax)(x))
# Smallest candidate at which pctremain2 exceeds 1.
# NOTE(review): if no candidate qualifies, min() of an empty vector yields Inf with a warning -- confirm this is acceptable.
d0<-min(d[pctremain2(d) > 1])
invis <- gc(verbose=FALSE)
fdr<-c(1-pctremain(x0[1]), 1-pctremain(z0[1])) ## New for SEACR_1.1
}else{ ## If 2nd field is numeric, calculate percentile threshold
Expand All @@ -118,7 +140,7 @@ if(is.na(numtest)){ ## If 2nd field is a bedgraph, calculate empirical threshold
fdr<-ctrl[1] ## New for SEACR_1.1
}
invis <- gc(verbose=FALSE)
write.table(c(x0[1],z0[1]), file=paste(argsL$output, ".threshold.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
write.table(c(x0[1],z0[1],d0[1]), file=paste(argsL$output, ".threshold.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
if(argsL$norm=="yes"){
write.table(constant, file=paste(argsL$output, ".norm.txt", sep=""), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) #Added 7/19/18 to ensure norm value is multiplied by ctrl
}
Expand Down
37 changes: 20 additions & 17 deletions SEACR_1.1.sh → SEACR_1.2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ then
echo "
SEACR: Sparse Enrichment Analysis for CUT&RUN
Usage: bash SEACR_1.1.sh <experimental bedgraph>.bg [<control bedgraph>.bg | <FDR threshold>] ["norm" | "non"] ["relaxed" | "stringent"] output prefix
Usage: bash SEACR_1.2.sh <experimental bedgraph>.bg [<control bedgraph>.bg | <FDR threshold>] ["norm" | "non"] ["relaxed" | "stringent"] output prefix
Description of input fields:
Expand Down Expand Up @@ -42,12 +42,12 @@ then
Field 6: Region representing the farthest upstream and farthest downstream bases within the denoted coordinates that are represented by the maximum bedgraph signal
Examples:
bash SEACR_1.1.sh target.bedgraph IgG.bedgraph norm stringent output
bash SEACR_1.2.sh target.bedgraph IgG.bedgraph norm stringent output
Calls enriched regions in target data using normalized IgG control track with stringent threshold
bash SEACR_1.1.sh target.bedgraph IgG.bedgraph non relaxed output
bash SEACR_1.2.sh target.bedgraph IgG.bedgraph non relaxed output
Calls enriched regions in target data using non-normalized IgG control track with relaxed threshold
bash SEACR_1.1.sh target.bedgraph 0.01 non stringent output
bash SEACR_1.2.sh target.bedgraph 0.01 non stringent output
Calls enriched regions in target data by selecting the top 1% of regions by area under the curve (AUC)
"
exit 1
Expand Down Expand Up @@ -98,15 +98,17 @@ fi

echo "Creating experimental AUC file: $(date)"

awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); s++}else{if(chr==$1 && $2==stop){stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2)}}}' $1 > $password.auc.bed
cut -f 4,5 $password.auc.bed > $password.auc
awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1; s++}else{if(chr==$1 && $2==stop){num++; stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3
}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord"\t"num; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1}}}' $1 > $password.auc.bed
cut -f 4,7 $password.auc.bed > $password.auc

if [[ -f $2 ]]
then
echo "Creating control AUC file: $(date)"
echo "Creating control AUC file: $(date)"

awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); s++}else{if(chr==$1 && $2==stop){stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2)}}}' $2 > $password2.auc.bed
cut -f 4,5 $password2.auc.bed > $password2.auc
# Collapse adjacent control bedgraph lines (file $2) into signal blocks and write one
# tab-separated row per block: chr, start, stop, total signal (AUC), max signal,
# max-signal region (chr:start-stop), and the number of component bedgraph lines.
# The first input line is consumed without being used (s==1 branch); a block is printed
# when the next line does not start where the current block stops or is on another chromosome.
# The awk program is kept on a single line on purpose: a line break between "-" and $3
# ends the coord assignment early and silently drops the region's end coordinate.
awk 'BEGIN{s=1}; {if(s==1){s++}else if(s==2){chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1; s++}else{if(chr==$1 && $2==stop){num++; stop=$3; auc=auc+($4*($3-$2)); if ($4 > max){max=$4; coord=$1":"$2"-"$3}else if($4 == max){split(coord,t,"-"); coord=t[1]"-"$3}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord"\t"num; chr=$1; start=$2; stop=$3; max=$4; coord=$1":"$2"-"$3; auc=$4*($3-$2); num=1}}}' $2 > $password2.auc.bed
cut -f 4,7 $password2.auc.bed > $password2.auc
fi

# module load R ## For use on cluster
Expand All @@ -133,24 +135,25 @@ fdr2=`cat $password.fdr.txt | sed -n '2p'` ## Added 5/15/19 for SEACR_1.1
#thresh=`cat $exp.threshold.txt`
thresh=`cat $password.threshold.txt | sed -n '1p'`
thresh2=`cat $password.threshold.txt | sed -n '2p'`
thresh3=`cat $password.threshold.txt | sed -n '3p'`

echo "Creating thresholded feature file: $(date)"

if [[ $height == "relaxed" ]]
if [[ $height == "relaxed" ]]
then
echo "Empirical false discovery rate = $fdr2"
awk -v value=$thresh2 '$4 > value {print $0}' $password.auc.bed > $password.auc.threshold.bed
echo "Empirical false discovery rate = $fdr2"
awk -v value=$thresh2 -v value2=$thresh3 '$4 > value && $7 > value2 {print $0}' $password.auc.bed | cut -f 1,2,3,4,5,6 > $password.auc.threshold.bed
else
echo "Empirical false discovery rate = $fdr"
awk -v value=$thresh '$4 > value {print $0}' $password.auc.bed > $password.auc.threshold.bed
echo "Empirical false discovery rate = $fdr"
awk -v value=$thresh -v value2=$thresh3 '$4 > value && $7 > value2 {print $0}' $password.auc.bed | cut -f 1,2,3,4,5,6 > $password.auc.threshold.bed
fi

if [[ -f $2 ]]
then
if [[ $norm == "norm" ]] #If normalizing, multiply control bedgraph by normalization constant
then
constant=`cat $password.norm.txt | sed -n '1p'`
awk -v mult=$constant 'BEGIN{OFS="\t"}; {$4=$4*mult; print $0}' $password2.auc.bed > $password2.auc2.bed
awk -v mult=$constant 'BEGIN{OFS="\t"}; {$4=$4*mult; print $0}' $password2.auc.bed | cut -f 1,2,3,4,5,6 > $password2.auc2.bed
mv $password2.auc2.bed $password2.auc.bed
fi
awk -v value=$thresh '$4 > value {print $0}' $password2.auc.bed > $password2.auc.threshold.bed
Expand All @@ -163,9 +166,9 @@ mean=`awk '{s+=$3-$2; t++}END{print s/(t*10)}' $password.auc.threshold.bed`

if [[ -f $2 ]]
then
awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"}u[2]}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed | bedtools intersect -wa -v -a - -b $password2.auc.threshold.bed > $5.auc.threshold.merge.bed
awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"u[2]}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed | bedtools intersect -wa -v -a - -b $password2.auc.threshold.bed > $5.auc.threshold.merge.bed
else
awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"}u[2]}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed > $5.auc.threshold.merge.bed
awk -v value=$mean 'BEGIN{s=1}; {if(s==1){chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6; s++}else{if(chr==$1 && $2 < stop+value){stop=$3; auc=auc+$4; if($5 > max){max=$5; coord=$6}else if($5==max){split(coord,t,"-"); split($6,u,"-"); coord=t[1]"-"u[2]}}else{print chr"\t"start"\t"stop"\t"auc"\t"max"\t"coord; chr=$1; start=$2; stop=$3; auc=$4; max=$5; coord=$6}}}' $password.auc.threshold.bed > $5.auc.threshold.merge.bed
fi

if [[ $height == "relaxed" ]]
Expand Down

0 comments on commit 8ed4b16

Please sign in to comment.