Merge pull request #113 from reichan1998/cram_handling

Implement cram chunking and minimap2-based Hi-C alignments
sanger-tol · Sep 17, 2024 · 51a1d9f · 51a1d9f
2 parents 6c6505d + c93c6bc
commit 51a1d9f
Show file tree

Hide file tree

Showing 33 changed files with 1,508 additions and 34 deletions.
diff --git a/bin/awk_filter_reads.sh b/bin/awk_filter_reads.sh
@@ -0,0 +1 @@
+awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
diff --git a/bin/filter_five_end.pl b/bin/filter_five_end.pl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $prev_id = "";
+my @five;
+my @three;
+my @unmap;
+my @mid;
+my @all;
+my $counter = 0;
+
+while (<STDIN>){
+    chomp;
+    if (/^@/){
+        print $_."\n";
+        next;
+    }
+    my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
+    my $bin = reverse(dec2bin($flag));
+    my @binary = split(//,$bin);
+    if ($prev_id ne $id && $prev_id ne ""){
+        if ($counter == 1){
+            if (@five == 1){
+                print $five[0]."\n";
+            }
+            else{
+                my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+                my $bin_1 = reverse(dec2bin($flag_1));
+                my @binary_1 = split(//,$bin_1);
+                $binary_1[2] = 1;
+                my $bin_1_new = reverse(join("",@binary_1));
+                my $flag_1_new =  bin2dec($bin_1_new);
+                print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+            }
+        }
+        elsif ($counter == 2 && @five == 1){
+            print $five[0]."\n";
+        }
+        else{
+            my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+            my $bin_1 = reverse(dec2bin($flag_1));
+            my @binary_1 = split(//,$bin_1);
+            $binary_1[2] = 1;
+            my $bin_1_new = reverse(join("",@binary_1));
+            my $flag_1_new =  bin2dec($bin_1_new);
+            print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+        }
+
+        $counter = 0;
+        undef @unmap;
+        undef @five;
+        undef @three;
+        undef @mid;
+        undef @all;
+    }
+
+    $counter++;
+    $prev_id = $id;
+    push @all,$_;
+    if ($binary[2]==1){
+        push @unmap,$_;
+    }
+    elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
+        push @five, $_;
+    }
+    elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
+        push @three, $_;
+    }
+    elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
+        push @mid, $_;
+    }
+}
+
+if ($counter == 1){
+    if (@five == 1){
+        print $five[0]."\n";
+    }
+    else{
+        my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+        my $bin_1 = reverse(dec2bin($flag_1));
+        my @binary_1 = split(//,$bin_1);
+        $binary_1[2] = 1;
+        my $bin_1_new = reverse(join("",@binary_1));
+        my $flag_1_new =  bin2dec($bin_1_new);
+        print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+    }
+}
+elsif ($counter == 2 && @five == 1){
+    print $five[0]."\n";
+}
+else{
+    my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
+    my $bin_1 = reverse(dec2bin($flag_1));
+    my @binary_1 = split(//,$bin_1);
+    $binary_1[2] = 1;
+    my $bin_1_new = reverse(join("",@binary_1));
+    my $flag_1_new =  bin2dec($bin_1_new);
+    print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
+}
+
+sub dec2bin {
+    my $str = unpack("B32", pack("N", shift));
+    return $str;
+}
+
+sub bin2dec {
+    return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
+}
diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# generate_cram_csv.sh
+# -------------------
+# Generate a csv file describing the CRAM folder
+# ><((((°>    Y    ><((((°>    U     ><((((°>    M     ><((((°>     I     ><((((°>
+# Author = yy5
+# ><((((°>    Y    ><((((°>    U     ><((((°>    M     ><((((°>     I     ><((((°>
+
+# NOTE: chunk_size is the number of containers per chunk (not records/reads)
+
+# Function to process chunking of a CRAM file
+
+chunk_cram() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+    local crai=$4
+    local chunk_size=$5
+
+    local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
+    local ncontainers=$(zcat "${realcrai}" | wc -l)
+    local base=$(basename "${realcram}" .cram)
+
+    if [ $chunk_size -gt $ncontainers ]; then
+        chunk_size=$((ncontainers - 1))
+    fi
+    local from=0
+    local to=$((chunk_size - 1))
+
+    while [ $to -lt $ncontainers ]; do
+        #echo "chunk $chunkn: $from - $to"
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
+        from=$((to + 1))
+        to=$((to + chunk_size))
+        ((chunkn++))
+    done
+
+    # Catch any remainder
+    if [ $from -lt $ncontainers ]; then
+        to=$((ncontainers - 1))
+        #echo "chunk $chunkn: $from - $to"
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
+        ((chunkn++))
+    fi
+}
+
+# Function to process a CRAM file
+process_cram_file() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+    local crai=$4
+    local chunk_size=$5
+
+    local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
+    local num_read_groups=$(echo "$read_groups" | wc -w)
+    if [ "$num_read_groups" -gt 1 ]; then
+        # Multiple read groups: process each separately
+        for rg in $read_groups; do
+            local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
+            samtools view -h -r "$rg" -o "$output_cram" "$cram"
+            #chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
+            chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
+        done
+    else
+        # Single read group or no read groups
+        #chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
+        chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
+    fi
+}
+
+#  /\_/\        /\_/\
+# ( o.o ) main ( o.o )
+#  > ^ <        > ^ <
+
+# Check if cram_path is provided
+if [ $# -lt 3 ]; then
+    echo "Usage: $0 <cram_path> <output_csv> <crai_file> <chunk_size>"
+    exit 1
+fi
+
+cram=$1
+outcsv=$2
+crai=$3
+if [ -z "$4" ]; then
+    chunk_size=10000
+else
+    chunk_size=$4
+fi
+chunkn=0
+
+# Operates on a single CRAM file
+realcram=$(readlink -f $cram)
+realcrai=$(readlink -f $crai)
+if [ -f "$outcsv" ]; then
+    rm "$outcsv"
+fi
+process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size
diff --git a/bin/grep_pg.sh b/bin/grep_pg.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# grep_pg.sh
+# -------------------
+# A shell script to exclude pg lines and label read 1 and read 2 from cram containers
+#
+# -------------------
+# Author = yy5
+
+grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
diff --git a/conf/base.config b/conf/base.config
@@ -52,6 +52,12 @@ process {
         time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
 
+    withName: SAMTOOLS_INDEX {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
+    }
+
     withName: BLAST_BLASTN {
         time   = { check_max(          2.hour  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
         memory = { check_max( 100.MB + 20.MB   * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
@@ -88,6 +94,21 @@ process {
         time   = { check_max(        1.h  * Math.ceil( meta.read_count   / 1000000   ) * task.attempt, 'time'   ) }
     }
 
+    withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
+        cpus    = { check_max( 16       * 1                                                            , 'cpus'   ) }
+        memory  = { check_max( 1.GB     * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+    }
+
+    withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
+        cpus    = { check_max( 16       * 1                                                 , 'cpus'   ) }
+        memory  = { check_max( 1.GB     * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+    }
+
+    withName: MINIMAP2_INDEX {
+        cpus    = { check_max( 2    * task.attempt, 'cpus'    ) }
+        memory  = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
+    }
+
     withName: CRUMBLE {
         // No correlation between memory usage and the number of reads or the genome size.
         // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.

diff --git a/conf/modules.config b/conf/modules.config
@@ -16,14 +16,6 @@ process {
         ext.args = '-F 0x200 -nt'
     }
 
-    withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
-        ext.args = { "-5SPCp -R ${meta.read_group}" }
-    }
-
-    withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
-        ext.args = { "-R ${meta.read_group}" }
-    }
-
     withName: SAMTOOLS_MERGE {
         beforeScript = { "export REF_PATH=spoof"}
         ext.args = { "-c -p" }
@@ -50,6 +42,54 @@ process {
         ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
     }
 
+    withName: CONVERT_CRAM {
+        ext.args = "--output-fmt cram"
+    }
+
+    withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
+        ext.args = { "-5SPCp -R ${meta.read_group}" }
+    }
+
+    withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
+        ext.args = { "-p -R ${meta.read_group}" }
+    }
+
+    withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-p -R '${rglines}'" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-ax sr" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "-F 0x200 -nt" }
+        ext.args2       = { "-5SPCp -R '${rglines}'" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
+        ext.args        = ""
+        ext.args1       = { "" }
+        ext.args2       = { "-ax sr" }
+        ext.args3       = "-mpu"
+        ext.args4       = { "--write-index -l1" }
+    }
+
+    withName: "MINIMAP2_INDEX" {
+        ext.args        = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
+    }
+
         // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
         // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
         // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
@@ -86,6 +126,11 @@ process {
         ext.prefix = { "${bam.baseName}" }
     }
 
+    withName: SAMTOOLS_ADDREPLACERG {
+        ext.prefix = { "${input.baseName}_addRG" }
+        ext.args = { "-r ${meta.read_group} --no-PG" }
+    }
+
     withName: SAMTOOLS_STATS {
         beforeScript = { "export REF_PATH=spoof"}
         ext.prefix = { "${input.baseName}" }

diff --git a/conf/test.config b/conf/test.config
@@ -25,6 +25,9 @@ params {
     // Output directory
     outdir = "${projectDir}/results"
 
+    // Aligner
+    short_aligner = "minimap2"
+
     // Fasta references
     fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
     header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam"

diff --git a/modules.json b/modules.json
@@ -40,6 +40,11 @@
                         "git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
                         "installed_by": ["modules"]
                     },
+                    "minimap2/index": {
+                        "branch": "master",
+                        "git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/faidx": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
@@ -60,6 +65,11 @@
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
                         "installed_by": ["modules"]
                     },
+                    "samtools/index": {
+                        "branch": "master",
+                        "git_sha": "46eca555142d6e597729fcb682adcc791796f514",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/merge": {
                         "branch": "master",
                         "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",