Skip to content

Commit

Permalink
Functionality to detect alignment bias
Browse files Browse the repository at this point in the history
  • Loading branch information
sposadac committed Jun 5, 2020
1 parent 4a3e1ae commit a0b2ba3
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 1 deletion.
1 change: 1 addition & 0 deletions envs/test_snv.yaml → envs/testbench.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ dependencies:
- sh
- biopython
- scipy
- pysam
- mafft=7.305
11 changes: 10 additions & 1 deletion rules/benchmark.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class VpipeBenchConfig(VpipeConfig):
'simBench': __RECORD__(
value=f"{VPIPE_BASEDIR}/scripts/simBench.py", type=str),
'art': __RECORD__(value="art_illumina", type=str),
'alignmentBias': __RECORD__(
value=f"{VPIPE_BASEDIR}/scripts/alignmentBias.py", type=str),
'testBench': __RECORD__(
value=f"{VPIPE_BASEDIR}/scripts/testBench.py", type=str),
'alignmentIntervals' : __RECORD__(
Expand Down Expand Up @@ -81,10 +83,17 @@ class VpipeBenchConfig(VpipeConfig):
'mem': __RECORD__(value=5000, type=int),
'time': __RECORD__(value=90, type=int),
}),
('alignment_bias', {
'mem': __RECORD__(value=2000, type=int),
'time': __RECORD__(value=60, type=int),
'conda': __RECORD__(
value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),
}),
('test_snv', {
'mem': __RECORD__(value=2000, type=int),
'time': __RECORD__(value=60, type=int),
'conda': __RECORD__(value='', type=str),
'conda': __RECORD__(
value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),

're_msa': __RECORD__(value=False, type=bool),
}),
Expand Down
45 changes: 45 additions & 0 deletions rules/testbench.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,51 @@ def snvfiles(wildcards):
sys.exit(1)
return snv_files

rule alignment_bias:
input:
REF = reference_file,
BAM = "{sample_dir}/{sample_name}/{date}/alignments/REF_aln.bam",
R1gz = "{sample_dir}/{sample_name}/{date}/preprocessed_data/R1.fastq.gz",
HAPLOTYPE_SEQS = "{sample_dir}/{sample_name}/{date}/references/haplotypes/haplotypes.fasta",
output:
"{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.tsv"
params:
scratch = '2000',
mem = config.alignment_bias['mem'],
time = config.alignment_bias['time'],
PAIRED = '-p' if config.input['paired'] else '',
ID = lambda wildcards: f'{wildcards.sample_name}-{wildcards.date}',
ALIGNMENT_BIAS = config.applications['alignmentBias'],
log:
outfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.out.log",
errfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.out.log"
conda:
config.alignment_bias['conda']
threads:
1
shell:
"""
{params.ALIGNMENT_BIAS} -r {input.REF} -b {input.BAM} -f <(zcat {input.R1gz}) --hap {input.HAPLOTYPE_SEQS} {params.PAIRED} -N {params.ID} -o {output}
"""

rule aggregate_alignment_bias:
input:
expand("{dataset}/alignments/alignment_bias.tsv", dataset=datasets)
output:
"stats/alignment_bias.tsv"
params:
scratch = '1250',
mem = config.aggregate['mem'],
time = config.aggregate['time']
log:
outfile = "stats/alignment_bias.out.log",
errfile = "stats/alignment_bias.out.log"
shell:
"""
awk FNR!=1 {input} > {output}
sed -i 1i"SampleID\tHaplotypeID\tDivergence\tPercent-aligned\tPercent-bases-aligne\n" {output}
"""


rule aggregate_beforeSB:
input:
Expand Down
135 changes: 135 additions & 0 deletions scripts/alignmentBias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python3

import argparse
import pysam
from Bio import SeqIO, pairwise2

__author__ = "Susana Posada-Cespedes"
__license__ = "Apache2.0"
__maintainer__ = "Ivan Topolsky"
__email__ = "[email protected]"


def parse_args():
""" Set up the parsing of command-line arguments """

parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument(
"-r", required=True, default=None, metavar='FASTA',
dest='reference_file', type=str, help="Referene sequence"
)
requiredNamed.add_argument(
"-b", required=True, default=None, metavar='BAM', dest='bamfile',
help="BAM file"
)
requiredNamed.add_argument(
"-f", required=True, default=None, metavar='FASTQ', dest='fastq_file',
help="FASTQ file containing reads after QC"
)
requiredNamed.add_argument(
"--hap", required=True, default=None, metavar='FASTA',
dest='haplotype_seqs', help="FASTA file containing haplotype sequences"
)
requiredNamed.add_argument(
"-N", required=False, default='sample', metavar='STR',
dest='sampleID', help="Patient/sample identifiers"
)
parser.add_argument(
"-p", required=False, default=False, action='store_true',
dest='paired', help="Indicate whether to simulate paired-end reads"
)
parser.add_argument(
"-o", required=False, default='alignment_bias.tsv', metavar='FILENAME',
dest='output_file', type=str,
help="Output file"
)

return parser.parse_args()


def hamming_dist(s1, s2):
"""Return the Hamming distance between sequences"""
assert len(s1) == len(s2), ("Hamming distance is undefined for sequences "
"differing in their lengths")
return sum(el1 != el2 for el1, el2 in zip(s1, s2))


def main():
args = parse_args()
reference = SeqIO.read(args.reference_file, "fasta")
reference_len = len(reference)
start = None
end = None

hap_cnt = {}
hap_coverage = {}
with pysam.AlignmentFile(args.bamfile, 'rb') as alnfile:
for read in alnfile.fetch(reference=reference.id, start=start, end=end):
query_name = read.query_name
hapID = query_name.split('-')[0]
if hapID in hap_cnt:
hap_cnt[hapID] += 1
else:
hap_cnt[hapID] = 1

# Get the percent of aligned bases per read
cigar_arr = read.get_cigar_stats()[0]
aux = (cigar_arr[0] + cigar_arr[1])
assert aux == read.query_alignment_length
percent_aligned = read.query_alignment_length / read.infer_read_length()
assert percent_aligned <= 1, f"{cigar_arr}"
if hapID in hap_coverage:
hap_coverage[hapID] += percent_aligned
else:
hap_coverage[hapID] = percent_aligned

for k, v in hap_cnt.items():
aux = hap_coverage[k] / v
assert aux <= 1, f"{aux}"
hap_coverage[k] = aux

# Parse FASTQ file to compute the expected number of reads per haplotype
hap_expected = {}
total = 0
for read in SeqIO.parse(args.fastq_file, "fastq"):
total += 1
hapID = read.id.split('-')[0]
if hapID in hap_expected:
hap_expected[hapID] += 1
else:
hap_expected[hapID] = 1

percent_aligned = {}
for k, v in hap_expected.items():
if k in hap_cnt:
if args.paired:
percent_aligned[k] = hap_cnt[k] / (v * 2)
else:
percent_aligned[k] = hap_cnt[k] / v
else:
percent_aligned[k] = 0
hap_coverage[k] = 0

# Compute divergence from the reference sequence
divergence = {}
affine_pen = pairwise2.affine_penalty(-1, -0.1, True)
with open(args.haplotype_seqs, 'r') as infile:
for record in SeqIO.parse(infile, 'fasta'):
alignment = pairwise2.align.globalmc(reference.seq, record.seq, 1, 0,
affine_pen, affine_pen,
one_alignment_only=True)
divergence[record.id] = hamming_dist(alignment[0][0],
alignment[0][1]) / alignment[0][4]

with open(args.output_file, 'w') as outfile:
outfile.write("SampleID\tHaplotypeID\tDivergence\tPercent-aligned\t"
"Percent-bases-aligned\n")
for k, v in percent_aligned.items():
outfile.write(f"{args.sampleID}\t{k}\t{divergence[k]}\t{v}\t"
f"{hap_coverage[k]}\n")

if __name__ == '__main__':
main()
1 change: 1 addition & 0 deletions vpipeBench.snake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rule allbench:
input:
RESULTS,
"variants/SNV_calling_performance.tsv",
"stats/alignment_bias.tsv",
"stats/coverage_intervals.tsv",
"stats/read_counts.tsv"

Expand Down

0 comments on commit a0b2ba3

Please sign in to comment.