-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #113 from reichan1998/cram_handling
Implement cram chunking and minimap2-based Hi-C alignments
- Loading branch information
Showing
33 changed files
with
1,508 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/bin/perl | ||
use strict; | ||
use warnings; | ||
|
||
my $prev_id = ""; | ||
my @five; | ||
my @three; | ||
my @unmap; | ||
my @mid; | ||
my @all; | ||
my $counter = 0; | ||
|
||
while (<STDIN>){ | ||
chomp; | ||
if (/^@/){ | ||
print $_."\n"; | ||
next; | ||
} | ||
my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/; | ||
my $bin = reverse(dec2bin($flag)); | ||
my @binary = split(//,$bin); | ||
if ($prev_id ne $id && $prev_id ne ""){ | ||
if ($counter == 1){ | ||
if (@five == 1){ | ||
print $five[0]."\n"; | ||
} | ||
else{ | ||
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; | ||
my $bin_1 = reverse(dec2bin($flag_1)); | ||
my @binary_1 = split(//,$bin_1); | ||
$binary_1[2] = 1; | ||
my $bin_1_new = reverse(join("",@binary_1)); | ||
my $flag_1_new = bin2dec($bin_1_new); | ||
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); | ||
} | ||
} | ||
elsif ($counter == 2 && @five == 1){ | ||
print $five[0]."\n"; | ||
} | ||
else{ | ||
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; | ||
my $bin_1 = reverse(dec2bin($flag_1)); | ||
my @binary_1 = split(//,$bin_1); | ||
$binary_1[2] = 1; | ||
my $bin_1_new = reverse(join("",@binary_1)); | ||
my $flag_1_new = bin2dec($bin_1_new); | ||
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); | ||
} | ||
|
||
$counter = 0; | ||
undef @unmap; | ||
undef @five; | ||
undef @three; | ||
undef @mid; | ||
undef @all; | ||
} | ||
|
||
$counter++; | ||
$prev_id = $id; | ||
push @all,$_; | ||
if ($binary[2]==1){ | ||
push @unmap,$_; | ||
} | ||
elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){ | ||
push @five, $_; | ||
} | ||
elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){ | ||
push @three, $_; | ||
} | ||
elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){ | ||
push @mid, $_; | ||
} | ||
} | ||
|
||
if ($counter == 1){ | ||
if (@five == 1){ | ||
print $five[0]."\n"; | ||
} | ||
else{ | ||
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; | ||
my $bin_1 = reverse(dec2bin($flag_1)); | ||
my @binary_1 = split(//,$bin_1); | ||
$binary_1[2] = 1; | ||
my $bin_1_new = reverse(join("",@binary_1)); | ||
my $flag_1_new = bin2dec($bin_1_new); | ||
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); | ||
} | ||
} | ||
elsif ($counter == 2 && @five == 1){ | ||
print $five[0]."\n"; | ||
} | ||
else{ | ||
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; | ||
my $bin_1 = reverse(dec2bin($flag_1)); | ||
my @binary_1 = split(//,$bin_1); | ||
$binary_1[2] = 1; | ||
my $bin_1_new = reverse(join("",@binary_1)); | ||
my $flag_1_new = bin2dec($bin_1_new); | ||
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); | ||
} | ||
|
||
sub dec2bin { | ||
my $str = unpack("B32", pack("N", shift)); | ||
return $str; | ||
} | ||
|
||
sub bin2dec { | ||
return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/bin/bash | ||
|
||
# generate_cram_csv.sh | ||
# ------------------- | ||
# Generate a csv file describing the CRAM folder | ||
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°> | ||
# Author = yy5 | ||
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°> | ||
|
||
# NOTE: chunk_size is the number of containers per chunk (not records/reads) | ||
|
||
# Function to process chunking of a CRAM file | ||
|
||
chunk_cram() { | ||
local cram=$1 | ||
local chunkn=$2 | ||
local outcsv=$3 | ||
local crai=$4 | ||
local chunk_size=$5 | ||
|
||
local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g") | ||
local ncontainers=$(zcat "${realcrai}" | wc -l) | ||
local base=$(basename "${realcram}" .cram) | ||
|
||
if [ $chunk_size -gt $ncontainers ]; then | ||
chunk_size=$((ncontainers - 1)) | ||
fi | ||
local from=0 | ||
local to=$((chunk_size - 1)) | ||
|
||
while [ $to -lt $ncontainers ]; do | ||
#echo "chunk $chunkn: $from - $to" | ||
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv | ||
from=$((to + 1)) | ||
to=$((to + chunk_size)) | ||
((chunkn++)) | ||
done | ||
|
||
# Catch any remainder | ||
if [ $from -lt $ncontainers ]; then | ||
to=$((ncontainers - 1)) | ||
#echo "chunk $chunkn: $from - $to" | ||
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv | ||
((chunkn++)) | ||
fi | ||
} | ||
|
||
# Function to process a CRAM file | ||
process_cram_file() { | ||
local cram=$1 | ||
local chunkn=$2 | ||
local outcsv=$3 | ||
local crai=$4 | ||
local chunk_size=$5 | ||
|
||
local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}') | ||
local num_read_groups=$(echo "$read_groups" | wc -w) | ||
if [ "$num_read_groups" -gt 1 ]; then | ||
# Multiple read groups: process each separately | ||
for rg in $read_groups; do | ||
local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram" | ||
samtools view -h -r "$rg" -o "$output_cram" "$cram" | ||
#chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size") | ||
chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size" | ||
done | ||
else | ||
# Single read group or no read groups | ||
#chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size") | ||
chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size" | ||
fi | ||
} | ||
|
||
# /\_/\ /\_/\ | ||
# ( o.o ) main ( o.o ) | ||
# > ^ < > ^ < | ||
|
||
# Check if cram_path is provided | ||
if [ $# -lt 3 ]; then | ||
echo "Usage: $0 <cram_path> <output_csv> <crai_file> <chunk_size>" | ||
exit 1 | ||
fi | ||
|
||
cram=$1 | ||
outcsv=$2 | ||
crai=$3 | ||
if [ -z "$4" ]; then | ||
chunk_size=10000 | ||
else | ||
chunk_size=$4 | ||
fi | ||
chunkn=0 | ||
|
||
# Operates on a single CRAM file | ||
realcram=$(readlink -f $cram) | ||
realcrai=$(readlink -f $crai) | ||
if [ -f "$outcsv" ]; then | ||
rm "$outcsv" | ||
fi | ||
process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/bin/bash | ||
|
||
# grep_pg.sh | ||
# ------------------- | ||
# A shell script to exclude pg lines and label read 1 and read 2 from cram containers | ||
# | ||
# ------------------- | ||
# Author = yy5 | ||
|
||
grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.