Skip to content

Commit

Permalink
Merge pull request #113 from reichan1998/cram_handling
Browse files Browse the repository at this point in the history
Implement cram chunking and minimap2-based Hi-C alignments
  • Loading branch information
tkchafin authored Sep 17, 2024
2 parents 6c6505d + c93c6bc commit 51a1d9f
Show file tree
Hide file tree
Showing 33 changed files with 1,508 additions and 34 deletions.
1 change: 1 addition & 0 deletions bin/awk_filter_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
109 changes: 109 additions & 0 deletions bin/filter_five_end.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/perl
use strict;
use warnings;

my $prev_id = "";
my @five;
my @three;
my @unmap;
my @mid;
my @all;
my $counter = 0;

while (<STDIN>){
chomp;
if (/^@/){
print $_."\n";
next;
}
my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
my $bin = reverse(dec2bin($flag));
my @binary = split(//,$bin);
if ($prev_id ne $id && $prev_id ne ""){
if ($counter == 1){
if (@five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

$counter = 0;
undef @unmap;
undef @five;
undef @three;
undef @mid;
undef @all;
}

$counter++;
$prev_id = $id;
push @all,$_;
if ($binary[2]==1){
push @unmap,$_;
}
elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
push @five, $_;
}
elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
push @three, $_;
}
elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
push @mid, $_;
}
}

if ($counter == 1){
if (@five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

sub dec2bin {
my $str = unpack("B32", pack("N", shift));
return $str;
}

sub bin2dec {
return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
}
99 changes: 99 additions & 0 deletions bin/generate_cram_csv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash

# generate_cram_csv.sh
# -------------------
# Generate a csv file describing the CRAM folder
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
# Author = yy5
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>

# NOTE: chunk_size is the number of containers per chunk (not records/reads)

# Function to process chunking of a CRAM file

chunk_cram() {
local cram=$1
local chunkn=$2
local outcsv=$3
local crai=$4
local chunk_size=$5

local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
local ncontainers=$(zcat "${realcrai}" | wc -l)
local base=$(basename "${realcram}" .cram)

if [ $chunk_size -gt $ncontainers ]; then
chunk_size=$((ncontainers - 1))
fi
local from=0
local to=$((chunk_size - 1))

while [ $to -lt $ncontainers ]; do
#echo "chunk $chunkn: $from - $to"
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
from=$((to + 1))
to=$((to + chunk_size))
((chunkn++))
done

# Catch any remainder
if [ $from -lt $ncontainers ]; then
to=$((ncontainers - 1))
#echo "chunk $chunkn: $from - $to"
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
((chunkn++))
fi
}

# Function to process a CRAM file
process_cram_file() {
local cram=$1
local chunkn=$2
local outcsv=$3
local crai=$4
local chunk_size=$5

local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
local num_read_groups=$(echo "$read_groups" | wc -w)
if [ "$num_read_groups" -gt 1 ]; then
# Multiple read groups: process each separately
for rg in $read_groups; do
local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
samtools view -h -r "$rg" -o "$output_cram" "$cram"
#chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
done
else
# Single read group or no read groups
#chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
fi
}

# /\_/\ /\_/\
# ( o.o ) main ( o.o )
# > ^ < > ^ <

# Check if cram_path is provided
if [ $# -lt 3 ]; then
echo "Usage: $0 <cram_path> <output_csv> <crai_file> <chunk_size>"
exit 1
fi

cram=$1
outcsv=$2
crai=$3
if [ -z "$4" ]; then
chunk_size=10000
else
chunk_size=$4
fi
chunkn=0

# Operates on a single CRAM file
realcram=$(readlink -f $cram)
realcrai=$(readlink -f $crai)
if [ -f "$outcsv" ]; then
rm "$outcsv"
fi
process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size
10 changes: 10 additions & 0 deletions bin/grep_pg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# grep_pg.sh
# -------------------
# A shell script to exclude pg lines and label read 1 and read 2 from cram containers
#
# -------------------
# Author = yy5

grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
21 changes: 21 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ process {
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: SAMTOOLS_INDEX {
cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: BLAST_BLASTN {
time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
Expand Down Expand Up @@ -88,6 +94,21 @@ process {
time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
cpus = { check_max( 16 * 1 , 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
}

withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
cpus = { check_max( 16 * 1 , 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
}

withName: MINIMAP2_INDEX {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
}

withName: CRUMBLE {
// No correlation between memory usage and the number of reads or the genome size.
// Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
Expand Down
61 changes: 53 additions & 8 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,6 @@ process {
ext.args = '-F 0x200 -nt'
}

withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-R ${meta.read_group}" }
}

withName: SAMTOOLS_MERGE {
beforeScript = { "export REF_PATH=spoof"}
ext.args = { "-c -p" }
Expand All @@ -50,6 +42,54 @@ process {
ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
}

withName: CONVERT_CRAM {
ext.args = "--output-fmt cram"
}

withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-p -R ${meta.read_group}" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-p -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-5SPCp -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: "MINIMAP2_INDEX" {
ext.args = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
}

// minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
// In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
// NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
Expand Down Expand Up @@ -86,6 +126,11 @@ process {
ext.prefix = { "${bam.baseName}" }
}

withName: SAMTOOLS_ADDREPLACERG {
ext.prefix = { "${input.baseName}_addRG" }
ext.args = { "-r ${meta.read_group} --no-PG" }
}

withName: SAMTOOLS_STATS {
beforeScript = { "export REF_PATH=spoof"}
ext.prefix = { "${input.baseName}" }
Expand Down
3 changes: 3 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ params {
// Output directory
outdir = "${projectDir}/results"

// Aligner
short_aligner = "minimap2"

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam"
Expand Down
10 changes: 10 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
"git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
"installed_by": ["modules"]
},
"minimap2/index": {
"branch": "master",
"git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
"installed_by": ["modules"]
},
"samtools/faidx": {
"branch": "master",
"git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
Expand All @@ -60,6 +65,11 @@
"git_sha": "46eca555142d6e597729fcb682adcc791796f514",
"installed_by": ["modules"]
},
"samtools/index": {
"branch": "master",
"git_sha": "46eca555142d6e597729fcb682adcc791796f514",
"installed_by": ["modules"]
},
"samtools/merge": {
"branch": "master",
"git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
Expand Down
Loading

0 comments on commit 51a1d9f

Please sign in to comment.