From e4f15a48e91ce9e9a8dcd4f82f5fcbce0d65884c Mon Sep 17 00:00:00 2001 From: liuxh Date: Sun, 27 Aug 2023 20:29:13 +0100 Subject: [PATCH 1/2] refactor code structure --- main.py | 256 ++++++------- ...omeChronicler_afogeno_generator_fromBAM.py | 193 ---------- ...omeChronicler_afogeno_generator_fromVCF.py | 156 -------- ...meChronicler_ancestry_generator_fromBAM.py | 299 --------------- ...meChronicler_ancestry_generator_fromVCF.py | 229 ------------ ...Chronicler_quickFilterFinalReportTables.py | 33 -- scripts/__init__.py | 0 scripts/afogeno_generator.py | 193 ++++++++++ scripts/ancestry_generator.py | 350 ++++++++++++++++++ ...mAfoGeno.py => genotables_from_afogeno.py} | 138 +++++-- ...omeChronicler_XLSX_fromTables.py => io.py} | 110 +++++- ...cestry.R => plot_generator_fromAncestry.R} | 10 +- ...stry.py => plot_generator_fromAncestry.py} | 4 +- scripts/utils.py | 65 ++++ ...bles_fromVEP.py => vep_tables_from_vep.py} | 38 +- 15 files changed, 940 insertions(+), 1134 deletions(-) delete mode 100755 scripts/GenomeChronicler_afogeno_generator_fromBAM.py delete mode 100755 scripts/GenomeChronicler_afogeno_generator_fromVCF.py delete mode 100755 scripts/GenomeChronicler_ancestry_generator_fromBAM.py delete mode 100755 scripts/GenomeChronicler_ancestry_generator_fromVCF.py delete mode 100755 scripts/GenomeChronicler_quickFilterFinalReportTables.py create mode 100644 scripts/__init__.py create mode 100755 scripts/afogeno_generator.py create mode 100755 scripts/ancestry_generator.py rename scripts/{GenomeChronicler_genoTables_fromAfoGeno.py => genotables_from_afogeno.py} (80%) rename scripts/{GenomeChronicler_XLSX_fromTables.py => io.py} (51%) rename scripts/{GenomeChronicler_plot_generator_fromAncestry.R => plot_generator_fromAncestry.R} (97%) rename scripts/{GenomeChronicler_plot_generator_fromAncestry.py => plot_generator_fromAncestry.py} (99%) create mode 100644 scripts/utils.py rename scripts/{GenomeChronicler_vepTables_fromVEP.py => vep_tables_from_vep.py} (80%) diff --git a/main.py b/main.py index 5bffa1d..7098656 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,27 @@ #!/usr/bin/env python3 import argparse -import re +import shutil import subprocess import sys from datetime import datetime from pathlib import Path -import shutil -import time -import os -from tqdm import tqdm +from scripts.afogeno_generator import generate_afogeno_pipeline +from scripts.ancestry_generator import generate_ancestry_pipeline +from scripts.genotables_from_afogeno import geno_tables_from_afogeno +from scripts.io import render_latex, export_genotypes_xlsx_from_csvs +from scripts.plot_generator_fromAncestry import plot_generator_from_ancestry +from scripts.utils import clean_bam_file_noCHR +from scripts.vep_tables_from_vep import get_vep_tables_from_vep + +def check_path_exist(path, file_type, + error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" + "path}], please check the provided path and try again ---\n", + exit_code=404): + if path and not Path(path).exists(): + print(error_template.format(file_type=file_type, path=path), file=sys.stderr) + sys.exit(exit_code) def print_header_ascii(): @@ -28,16 +39,6 @@ def print_header_ascii(): """) -def cleanBAMfile_noCHR(filename, verbose=False): - # Reheader the BAM file and create an index - subprocess.run( - f"samtools reheader -c 'perl -pe \"s/^(@SQ.*)(\tSN:)chr/\$1\$2/\"' {filename} > {filename}.clean.BAM", - shell=True) - - print(f"\t +++ INFO: Indexing the BAM file") - subprocess.run(f"samtools index -@ 6 {filename}.clean.BAM", 
shell=True) - - def get_parser(): parser = argparse.ArgumentParser(description="GenomeChronicler") parser.add_argument('-d', '--debug', action='store_true', help='Debug flag') @@ -83,7 +84,7 @@ def get_parser(): parser.add_argument('--verbose', dest='verbose', type=bool, default=False) return parser -def main_druid(): +def main(): import os ### Processing Needed steps ### @@ -94,9 +95,6 @@ def main_druid(): dir = "/GenomeChronicler/" resultsdir = os.getcwd() - template_withVEP = f"{dir}templates/reportTemplate_withVEP.tex" - template_ohneVEP = f"{dir}templates/reportTemplate_ohneVEP.tex" - template = template_ohneVEP # Defining input options and their default values... parser = get_parser() @@ -124,15 +122,6 @@ def main_druid(): start_time = datetime.now() ##################### check file existence and proceed - - def check_path_exist(path, file_type, - error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" - "path}], please check the provided path and try again ---\n", - exit_code=404): - if path and not Path(path).exists(): - print(error_template.format(file_type=file_type, path=path), file=sys.stderr) - sys.exit(exit_code) - if BAM_file: check_path_exist(BAM_file, "BAM file", exit_code=404) elif gVCF_file: @@ -151,7 +140,6 @@ def check_path_exist(path, file_type, check_path_exist(templateParam, "LaTeX Template file", exit_code=501) sample = None - import os if BAM_file: sample = Path(BAM_file).name.split('.')[0] # sample = os.path.splitext(os.path.basename(BAM_file))[0] @@ -169,153 +157,125 @@ def check_path_exist(path, file_type, file=sys.stderr) sys.exit(555) - output_sample_dir = f"{resultsdir}/results/results_{sample}" - temp_dir = f"{output_sample_dir}/temp" - - os.system(f"mkdir -p {temp_dir}") - check_path_exist(temp_dir, "Results directory", exit_code=102, - error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " - "created. Please check permissions and try again ---\n") - - LOGFILE2 = f"{output_sample_dir}/{sample}.processingLog.stderr.txt" + output_dir = Path(resultsdir) + temp_dir = output_dir/"temp" + temp_dir.mkdir(parents=True, exist_ok=True) - # Dump parameters - if debugFlag: - print(f"SCRIPT = {scriptName}", file=sys.stderr) - print(f"DEBUG = {debugFlag}", file=sys.stderr) - print(f"BAM = {BAM_file}", file=sys.stderr) - print(f"VCF = {gVCF_file}", file=sys.stderr) - print(f"VEP = {VEP_file}\n", file=sys.stderr) - print(f"GATKthreads = {GATKthreads}\n\n", file=sys.stderr) + # os.system(f"mkdir -p {temp_dir}") + # check_path_exist(temp_dir, "Results directory", exit_code=102, + # error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " + # "created. 
Please check permissions and try again ---\n") - print(f"TEMPLATE = {template}", file=sys.stderr) - print(f"SAMPLE = {sample}", file=sys.stderr) - print(f"DIR = {dir}", file=sys.stderr) - print(f"TMPDIR = {temp_dir}\n", file=sys.stderr) - # print(f"LOGFILE1 = {LOGFILE1}\n", file=sys.stderr) - print(f"LOGFILE2 = {LOGFILE2}\n\n", file=sys.stderr) - - sys.exit(0) + LOGFILE2 = f"{output_dir}/{sample}.processingLog.stderr.txt" # Main bit of code print_header_ascii() - print(f"\t +++ INFO: Starting Processing at {start_time.strftime('%Y-%m-%d %H:%M:%S')}", file=sys.stderr) print(f"\t +++ INFO: Opening Log File at: {LOGFILE2}", file=sys.stderr) subprocess.run(f"echo > {LOGFILE2}", shell=True) - clean_sample = sample.replace("_", "\\_") - - with open(f"{output_sample_dir}/SampleName.txt", "w") as sample_name_file: - sample_name_file.write(clean_sample) - + output_vep_dir = None if VEP_file: - template = template_withVEP if not templateParam else templateParam - print("\t +++ INFO: Preprocessing VEP file") - cmd = f"python3 {dir}scripts/GenomeChronicler_vepTables_fromVEP.py {VEP_file} {output_sample_dir}/" - subprocess.run(cmd, shell=True) + output_vep_dir = output_dir/"vep_dir" + get_vep_tables_from_vep(VEP_file, output_vep_dir) if BAM_file: print("\t +++ INFO: Preprocessing BAM file") - if not os.path.exists(f"{BAM_file}.clean.BAM") or not os.path.exists(f"{BAM_file}.clean.BAM.bai"): - cleanBAMfile_noCHR(BAM_file, verbose=verbose) - - BAM_file = f"{BAM_file}.clean.BAM" - - print("\t +++ INFO: Generating Ancestry") - cmd = f"python3 {dir}scripts/GenomeChronicler_ancestry_generator_fromBAM.py {BAM_file} {output_sample_dir}/ {GATKthreads} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Ancestry Plots") - output_path = f"{output_sample_dir}/AncestryPlot.pdf" - if not Path(output_path).exists(): - cmd = f"SAMPLE={sample} ID={sample} DIR={resultsdir} R CMD BATCH {dir}scripts/GenomeChronicler_plot_generator_fromAncestry.R" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Genotypes Files") - cmd = f"python3 {dir}scripts/GenomeChronicler_afogeno_generator_fromBAM.py {BAM_file} {output_sample_dir}/ {GATKthreads} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - AFOgeno_file = f"{temp_dir}/{sample}.afogeno38.txt" + output_path = temp_dir/f"{sample}.clean.BAM" + if not output_path.exists(): + clean_bam_file_noCHR(BAM_file, temp_dir) + input_path = output_path elif gVCF_file: print("\t +++ INFO: Preprocessing VCF file") - - print("\t +++ INFO: Generating Ancestry") - if not Path(f"{output_sample_dir}/{sample}.genotypingVCF.vcf.gz").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_ancestry_generator_fromVCF.py {gVCF_file} {sample} {output_sample_dir} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - # if not Path(f"{output_sample_dir}/{sample}.genotypingVCF.vcf.gz").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_plot_generator_fromAncestry.py {output_sample_dir}/temp/{sample}_1kGP_pruned_pca_20.eigenvec {sample} {output_sample_dir}/" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Genotypes Files") - cmd = f"python3 {dir}scripts/GenomeChronicler_afogeno_generator_fromVCF.py {gVCF_file} {sample} {output_sample_dir} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - AFOgeno_file = f"{temp_dir}/{sample}.afogeno38.txt" - + input_path = gVCF_file else: print("\t +++ ERROR: No BAM or VCF file provided. 
Exiting.") sys.exit() - print("\t +++ INFO: Generating Genome Report Tables") - if not Path(f"{output_sample_dir}/latest.good.reportTable.csv").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_genoTables_fromAfoGeno.py {AFOgeno_file} {output_sample_dir}/ 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) + print("\t +++ INFO: Generating Ancestry") + output_ancestry_dir = output_dir/"ancestry" + output_ancestry_path = output_ancestry_dir/'vcf_ancestry'/'ancestry.rsIDs.gvcf.gz' + if not output_ancestry_path.exists(): + # cmd = f"python3 {dir}scripts/ancestry_generator.py ancestry_generator {BAM_file} {output_dir} {GATKthreads} 2>> {LOGFILE2}" + # subprocess.run(cmd, shell=True) + generate_ancestry_pipeline(input_path, output_ancestry_dir, GATKthreads) + + print("\t +++ INFO: Generating Ancestry Plots") + pca_eigenvec_path = f"{output_ancestry_dir}/pca_ancestry/1kGP_pruned_pca_20.eigenvec" + output_plot_path = f"{output_dir}/AncestryPlot.pdf" + if not Path(output_plot_path).exists(): + # cmd = f"PCA_PATH={pca_eigenvec_path} ID={sample} OUTPUT_DIR={output_plot_path} R CMD BATCH {dir}scripts/plot_generator_fromAncestry.R" + # ret = subprocess.run(cmd, shell=True) + # print(ret) + plot_generator_from_ancestry(pca_eigenvec_path, sample, output_dir) + # exit() + + print("\t +++ INFO: Generating Genotypes Files") + output_afogeno_dir = output_dir/"afogeno" + output_afogeno_path = output_ancestry_dir/'vcf_afogeno'/'afogeno.rsIDs.gvcf.gz' + afogeno_file = output_afogeno_dir/f"afogeno38.txt" + if not afogeno_file.exists(): + # cmd = f"python3 {dir}scripts/ancestry_generator.py ancestry_generator {BAM_file} {output_dir} {GATKthreads} 2>> {LOGFILE2}" + # subprocess.run(cmd, shell=True) + generate_afogeno_pipeline(input_path, output_afogeno_dir, GATKthreads) - print("\t +++ INFO: Filtering Report Tables") - cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.good.reportTable.csv" - subprocess.run(cmd, shell=True) - cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.bad.reportTable.csv" - subprocess.run(cmd, shell=True) + print("\t +++ INFO: Generating Genome Report Tables") + output_tables_dir = output_dir/"tables" + geno_tables_from_afogeno(afogeno_file, output_tables_dir) + for path in output_tables_dir.glob("*.csv"): + shutil.copy(path, output_dir) print("\t +++ INFO: Combining Excel Tables") - cmd = f"python3 {dir}scripts/GenomeChronicler_XLSX_fromTables.py {output_sample_dir}/ {output_sample_dir}/{sample}genotypes{dtag}.xlsx" - subprocess.run(cmd, shell=True) + output_xlsx_path = output_tables_dir/f"{sample}.genotypes.{dtag}.xlsx" + export_genotypes_xlsx_from_csvs(output_tables_dir, output_xlsx_path) + shutil.copy(output_xlsx_path, output_dir) # copy to main results dir + # render latex template print("\t +++ INFO: Compiling Genome Report") - TEMPLATETEX = f"{sample}_report_{dtag}" - shutil.copy(template, f"{output_sample_dir}/{TEMPLATETEX}.tex") - shutil.copy(f"{dir}templates/versionTable.txt", f"{output_sample_dir}/") - shutil.copy(f"{dir}templates/GeneStructure.pdf", f"{output_sample_dir}/") - - def run_latex(): - cwd = os.getcwd() - os.chdir(f"{resultsdir}/results/results_{sample}") - for _ in range(3): - cmd = f"pdflatex -interaction=nonstopmode {TEMPLATETEX}.tex 2> /dev/null > /dev/null" - subprocess.run(cmd, shell=True) - os.chdir(cwd) - - run_latex() - - if no_clean_temporary_files is False: - print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") - if BAM_file is 
 
-    print("\t +++ INFO: Filtering Report Tables")
-    cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.good.reportTable.csv"
-    subprocess.run(cmd, shell=True)
-    cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.bad.reportTable.csv"
-    subprocess.run(cmd, shell=True)
+    print("\t +++ INFO: Generating Genome Report Tables")
+    output_tables_dir = output_dir/"tables"
+    geno_tables_from_afogeno(afogeno_file, output_tables_dir)
+    for path in output_tables_dir.glob("*.csv"):
+        shutil.copy(path, output_dir)
 
     print("\t +++ INFO: Combining Excel Tables")
-    cmd = f"python3 {dir}scripts/GenomeChronicler_XLSX_fromTables.py {output_sample_dir}/ {output_sample_dir}/{sample}genotypes{dtag}.xlsx"
-    subprocess.run(cmd, shell=True)
+    output_xlsx_path = output_tables_dir/f"{sample}.genotypes.{dtag}.xlsx"
+    export_genotypes_xlsx_from_csvs(output_tables_dir, output_xlsx_path)
+    shutil.copy(output_xlsx_path, output_dir)  # copy to main results dir
 
+    # render latex template
     print("\t +++ INFO: Compiling Genome Report")
-    TEMPLATETEX = f"{sample}_report_{dtag}"
-    shutil.copy(template, f"{output_sample_dir}/{TEMPLATETEX}.tex")
-    shutil.copy(f"{dir}templates/versionTable.txt", f"{output_sample_dir}/")
-    shutil.copy(f"{dir}templates/GeneStructure.pdf", f"{output_sample_dir}/")
-
-    def run_latex():
-        cwd = os.getcwd()
-        os.chdir(f"{resultsdir}/results/results_{sample}")
-        for _ in range(3):
-            cmd = f"pdflatex -interaction=nonstopmode {TEMPLATETEX}.tex 2> /dev/null > /dev/null"
-            subprocess.run(cmd, shell=True)
-        os.chdir(cwd)
-
-    run_latex()
-
-    if no_clean_temporary_files is False:
-        print("\t +++ INFO: Cleaning up Temporary and Intermediate Files")
-        if BAM_file is not None:
-            os.remove(BAM_file)
-            os.remove(f"{BAM_file}.bai")
-
-        shutil.rmtree(f"{temp_dir}")
-        for file in Path(f"{output_sample_dir}/").glob("latest*.csv"):
-            os.remove(file)
-
-        subprocess.run(f"rm -rf {output_sample_dir}/versionTable.txt", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/GeneStructure.pdf", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.out", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/texput.log", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.aux", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.log", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.tex", shell=True)
-        subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True)
-
-    time.sleep(1)
-    if BAM_file is not None:
-        BAM_file = BAM_file.replace(".clean.BAM", "")
-    print(
-        f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds")
+    template_withVEP = f"{dir}templates/reportTemplate_withVEP.tex"
+    template_ohneVEP = f"{dir}templates/reportTemplate_ohneVEP.tex"
+    template = template_ohneVEP
+    if VEP_file:
+        template = template_withVEP if not templateParam else templateParam
+    run_latex_dir = output_dir/"latex"
+    render_latex(output_tables_dir, sample, run_latex_dir,
+                 plot_path=output_plot_path, vep_dir=output_vep_dir,
+                 template_dir="templates", latex_template=template,
+                 )
+    latex_pdf_path = run_latex_dir/f"{sample}_report.pdf"
+    shutil.copy(latex_pdf_path, output_dir)  # copy to main results dir
+
+
+    # if no_clean_temporary_files is False:
+    #     print("\t +++ INFO: Cleaning up Temporary and Intermediate Files")
+    #     if BAM_file is not None:
+    #         os.remove(BAM_file)
+    #         os.remove(f"{BAM_file}.bai")
+    #
+    #     shutil.rmtree(f"{temp_dir}")
+    #     for file in Path(f"{output_dir}/").glob("latest*.csv"):
+    #         os.remove(file)
+    #
+    #     subprocess.run(f"rm -rf {output_dir}/versionTable.txt", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/GeneStructure.pdf", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.out", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/texput.log", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.aux", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.log", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.tex", shell=True)
+    #     subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True)
+
+    # time.sleep(1)
+    # if BAM_file is not None:
+    #     BAM_file = BAM_file.replace(".clean.BAM", "")
+    # print(
+    #     f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds")
 
 
 if __name__ == '__main__':
-    main_druid()
+    main()
+
diff --git a/scripts/GenomeChronicler_afogeno_generator_fromBAM.py b/scripts/GenomeChronicler_afogeno_generator_fromBAM.py
deleted file mode 100755
index cbbdfe4..0000000
--- a/scripts/GenomeChronicler_afogeno_generator_fromBAM.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env python3
-import subprocess
-import os
-import sys
-import gzip
-from os.path import basename
-from pathlib import Path
-
-import fire
-
-
-def build_genotype(a1, a2, gen):
-    alleles = [a1] + a2.split(",")
-    extras = [""] * len(alleles)
-    l1 = len(alleles[0])
-    extra = ""
-    for i in range(1, len(alleles)):
-        extras[i] = ""
-        l2 = len(alleles[i])
-        if l1 > 1 or l2 > 1:
-            if l2 > l1:
-                alleles[0] = "-"
-                alleles[i] = alleles[i][1:]
-                extras[i] = "I"
-            else:
-                
alleles[0] = alleles[0][1:] - alleles[i] = "-" - extras[i] = "D" - res = [alleles[int(i)] for i in gen.split("/")] - extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") - return res, extra - - -def uniq(l): - return list(set(l)) - - -def afogeno_generator_from_BAM(input_BAM, resultsdir="", numThreads=4, - dir_reference='reference', dir_software='software', workdir="", - ): - if 'SINGULARITY_NAME' in os.environ: - workdir = "/GenomeChronicler/" - - # resultsdir = workdir - ref_bed = workdir + f"{dir_reference}/snps.19-114.unique.nochr.bed" - gatk = workdir + f"{dir_software}/GenomeAnalysisTK.jar" - ref_hs38 = workdir + f"{dir_reference}/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - bcftools = "bcftools" - samtools = "samtools" - bgzip = workdir + f"{dir_software}/bgzip" - tabix = workdir + f"{dir_software}/tabix" - - # if not len(sys.argv) > 1: - # sys.exit("Please provide a single base recalibrated BAM file to this script and optional output directory\n") - - # input_BAM = sys.argv[1] - sample = basename(input_BAM).rsplit(".", 1)[0] - sample = sample.replace(".recal", "").replace(".bam.clean", "") - - # if len(sys.argv) > 2: - # resultsdir = sys.argv[2] - - # if len(sys.argv) > 3: - # numThreads = sys.argv[3] - - subprocess.run(["mkdir", "-p", f"{resultsdir}/temp"]) - - if not os.path.exists(input_BAM): - sys.exit(f"The BAM file specified does not exist, please double-check the path and try again [{input_BAM}]") - - if not os.path.exists(ref_bed): - sys.exit(f"The BED file specified does not exist, please double-check the path and try again [{ref_bed}]") - - if not os.path.exists(ref_bed + ".gz"): - # Sort and compress BED - subprocess.run(["sort", "-k", "1,1", "-k2,2n", ref_bed, ">", "tempAAA"]) - subprocess.run(["mv", "tempAAA", ref_bed]) - subprocess.run([bgzip, "-c", f"{ref_bed}" + " > " + f"{ref_bed}.gz"]) - - # Force tabix re-index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - if not os.path.exists(f"{ref_bed}.gz.tbi"): - # tabix index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - # Define the command to run GATK HaplotypeCaller - output_path = f"{resultsdir}/temp/{sample}.g.vcf" - if not Path(output_path).exists(): - runstr = f"java -jar {gatk}" - runstr += f" -T HaplotypeCaller -R {ref_hs38}" - runstr += f" -I {input_BAM} -nct {numThreads}" - runstr += f" --emitRefConfidence GVCF -L {ref_bed}" - runstr += f" -o {output_path}" - - # Run GATK HaplotypeCaller - # print(runstr) - os.system(runstr) - # exit() - - # Define the command to run GATK GenotypeGVCFs - output_path2 = f"{resultsdir}/temp/{sample}.genotypes.vcf" - if not Path(output_path2).exists(): - runstr2 = f"java -jar {gatk}" - runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - runstr2 += f" --variant {output_path}" - runstr2 += f" -allSites -o {output_path2}" - runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - - # Run GATK GenotypeGVCFs - os.system(runstr2) - - # Add rsIDs to the VCF file using bcftools and bgzip - # os.system(f"cat {resultsdir}/temp/{sample}.genotypes.vcf | {bcftools} annotate -a {ref_bed}.gz -c CHROM,-,POS,ID | {bgzip} -c > {resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz") - vcf_path = f"{resultsdir}/temp/{sample}.genotypes.vcf" - bed_path = f"{ref_bed}.gz" - out_path = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - - cat_proc = subprocess.Popen(["cat", vcf_path], stdout=subprocess.PIPE) - annotate_proc = subprocess.Popen([bcftools, "annotate", "-a", bed_path, "-c", "CHROM,-,POS,ID"], - stdin=cat_proc.stdout, stdout=subprocess.PIPE) - 
bgzip_proc = subprocess.Popen(["bgzip", "-c"], stdin=annotate_proc.stdout, stdout=subprocess.PIPE) - - with open(out_path, "wb") as f: - f.write(bgzip_proc.communicate()[0]) - - ## Generate AFO Geno 38 file - gen_data = {} - counter_template = {} - - # with gzip.open(ref_bed, "rt") as IN: - with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: - for line in IN: - line = line.strip() - if not line: - continue - - # data = line.split(b"\t") - data = line.decode().split("\t") - # gen_data.setdefault(data[0], {})[int(data[2])] = data - gen_data[(data[0], data[2])] = data - counter_template[(data[0], data[2])] = 0 - - VCFfilename = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - debugCounter = {} - - with subprocess.Popen( - f'gzip -dcf {VCFfilename} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', - shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ - open(f"{resultsdir}/temp/{sample}.afogeno38.txt", "w") as out_file: - # for line_bytes in proc1.stdout: - for line_bytes in proc1.stdout: - line = line_bytes.decode().rstrip() - - if not line or line.startswith("#"): - continue - - line = line.replace(",", "") - data = line.split("\t") - - tmp = data[4].split(":") - data[4] = tmp[0] - - genotype = ["NA", "NA"] - extra = "NA" - - pos = (data[0], data[1]) - - if data[4] != "./.": - a1, a2 = data[2], data[3] - genotype, extra = build_genotype(a1, a2, data[4]) - else: - # No reads there - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - continue - - data.extend(genotype) - data.append(extra) - - # pos = (data[0], data[1]) - if pos in gen_data: - d2 = gen_data[pos] - out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" - out_file.write(out_data) - else: - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - - # Move the original VCF file to a new location - os.rename(VCFfilename, f"{resultsdir}/{sample}.genotypingVCF.vcf.gz") - - -if __name__ == '__main__': - fire.Fire(afogeno_generator_from_BAM) diff --git a/scripts/GenomeChronicler_afogeno_generator_fromVCF.py b/scripts/GenomeChronicler_afogeno_generator_fromVCF.py deleted file mode 100755 index 2676e88..0000000 --- a/scripts/GenomeChronicler_afogeno_generator_fromVCF.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import os -import sys -import gzip -from os.path import basename -from tqdm import tqdm -from pathlib import Path - -import fire - - -def build_genotype(a1, a2, gen): - alleles = [a1] + a2.split(",") - extras = [""] * len(alleles) - l1 = len(alleles[0]) - extra = "" - for i in range(1, len(alleles)): - extras[i] = "" - l2 = len(alleles[i]) - if l1 > 1 or l2 > 1: - if l2 > l1: - alleles[0] = "-" - alleles[i] = alleles[i][1:] - extras[i] = "I" - else: - alleles[0] = alleles[0][1:] - alleles[i] = "-" - extras[i] = "D" - res = [alleles[int(i)] for i in gen.split("/")] - extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") - return res, extra - - -def uniq(l): - return list(set(l)) - - -def afogeno_generator_from_VCF(input_VCF, sample, resultsdir="", - dir_reference='reference', dir_software='software', workdir="", ): - if 'SINGULARITY_NAME' in os.environ: - workdir = "/GenomeChronicler/" - - ref_bed = workdir + f"{dir_reference}/snps.19-114.unique.nochr.bed" - bcftools = "bcftools" - bgzip = workdir + f"{dir_software}/bgzip" - tabix = workdir + f"{dir_software}/tabix" - - # sample = basename(input_VCF).rsplit(".", 1)[0] - - subprocess.run(["mkdir", "-p", f"{resultsdir}/temp"]) - - if not 
os.path.exists(input_VCF): - sys.exit(f"The VCF file specified does not exist, please double-check the path and try again [{input_VCF}]") - - if not os.path.exists(ref_bed): - sys.exit(f"The BED file specified does not exist, please double-check the path and try again [{ref_bed}]") - - if not os.path.exists(ref_bed + ".gz"): - # Sort and compress BED - subprocess.run(["sort", "-k", "1,1", "-k2,2n", ref_bed, ">", "tempAAA"]) - subprocess.run(["mv", "tempAAA", ref_bed]) - subprocess.run([bgzip, "-c", f"{ref_bed}" + " > " + f"{ref_bed}.gz"]) - - # Force tabix re-index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - if not os.path.exists(f"{ref_bed}.gz.tbi"): - # tabix index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - # Add rsIDs to the VCF file using bcftools and bgzip - # os.system(f"cat {resultsdir}/temp/{sample}.genotypes.vcf | {bcftools} annotate -a {ref_bed}.gz -c CHROM,-,POS,ID | {bgzip} -c > {resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz") - # vcf_path = f"{resultsdir}/temp/{sample}.genotypes.vcf" - - vcf_path = input_VCF - bed_path = f"{ref_bed}.gz" - out_path = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - - # subprocess.run(f"cat {vcf_path} | {bcftools} annotate -a {bed_path} -c CHROM,-,POS,ID | {bgzip} -c > {out_path}", shell=True) - - cat_proc = subprocess.Popen(["cat", vcf_path], stdout=subprocess.PIPE) - annotate_proc = subprocess.Popen([bcftools, "annotate", "-a", bed_path, "-c", "CHROM,-,POS,ID"], - stdin=cat_proc.stdout, stdout=subprocess.PIPE) - bgzip_proc = subprocess.Popen(["bgzip", "-c"], stdin=annotate_proc.stdout, stdout=subprocess.PIPE) - - with open(out_path, "wb") as f: - f.write(bgzip_proc.communicate()[0]) - - # Generate AFO Geno 38 file - gen_data = {} - counter_template = {} - - # with gzip.open(ref_bed, "rt") as IN: - with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: - for line in IN: - line = line.strip() - if not line: - continue - - # data = line.split(b"\t") - data = line.decode().split("\t") - # gen_data.setdefault(data[0], {})[int(data[2])] = data - gen_data[(data[0], data[2])] = data - counter_template[(data[0], data[2])] = 0 - - VCFfilename = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - debugCounter = {} - - with subprocess.Popen( - f'gzip -dcf {VCFfilename} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', - shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ - open(f"{resultsdir}/temp/{sample}.afogeno38.txt", "w") as out_file: - # for line_bytes in proc1.stdout: - for line_bytes in proc1.stdout: - line = line_bytes.decode().rstrip() - - if not line or line.startswith("#"): - continue - - line = line.replace(",", "") - data = line.split("\t") - - tmp = data[4].split(":") - data[4] = tmp[0] - - genotype = ["NA", "NA"] - extra = "NA" - - pos = (data[0], data[1]) - - if data[4] != "./.": - a1, a2 = data[2], data[3] - genotype, extra = build_genotype(a1, a2, data[4]) - else: - # No reads there - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - continue - - data.extend(genotype) - data.append(extra) - - # pos = (data[0], data[1]) - if pos in gen_data: - d2 = gen_data[pos] - out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" - out_file.write(out_data) - else: - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - - # Move the original VCF file to a new location - os.rename(VCFfilename, f"{resultsdir}/{sample}.genotypingVCF.vcf.gz") - - -if __name__ == '__main__': - fire.Fire(afogeno_generator_from_VCF) diff --git 
a/scripts/GenomeChronicler_ancestry_generator_fromBAM.py b/scripts/GenomeChronicler_ancestry_generator_fromBAM.py deleted file mode 100755 index 78cbbe4..0000000 --- a/scripts/GenomeChronicler_ancestry_generator_fromBAM.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import subprocess -from os.path import basename - -from pathlib import Path -import fire - -def _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix): - """ - Get list of SNPs from 1kGP - - Input files: - - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - Output files: - - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz" - - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz.tbi" - - """ - output_path = f"{resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed" - if Path(output_path).exists(): - return - with open(initialBIM, "r") as inf, open(output_path, "w") as outf: - outf.write("#CHROM\tST0\tPOS\tID\n") - for line in inf: - a = line.strip().split("\t") - if hasCHRflag == 1 and not a[0].startswith("chr"): - a[0] = "chr" + a[0] - pp = a[0] - pp = pp.replace('chr','') - if int(pp) > 22: - continue - if a[4] == 'T' and a[5] == 'A': - continue - if a[4] == 'A' and a[5] == 'T': - continue - if a[4] == 'C' and a[5] == 'G': - continue - if a[4] == 'G' and a[5] == 'C': - continue - outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") - os.system(f"{bgzip} -c {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed > {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - os.system(f"{tabix} -p bed {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - - -def _get_gvcf(bam, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip): - """ - Convert BAM to a gvcf that contains only the 1kgp snps filtered above - - Input files: - - f"{dir}bam/{sample}.bam" - - {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed - - - {ref_hs38} - - Output files: - - f"{dir}temp/{sample}.g.vcf" - - f"{dir}temp/{sample}.genotypes.vcf" - - f"{dir}temp/{sample}.rsIDs.gvcf.gz" - - """ - tmp_dir = f"{resultsdir}/temp" - - output_g_path = f"{tmp_dir}/{sample}.ancestry.g.vcf" - if not Path(output_g_path).exists(): - runstr = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - runstr += f" -T HaplotypeCaller -R {ref_hs38}" - runstr += f" -I {bam} -nct {numThreads}" - runstr += f" --emitRefConfidence GVCF -L {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed" - runstr += f" -o {output_g_path}" - os.system(runstr) - - output_genotypes_path = f"{tmp_dir}/{sample}.ancestry.genotypes.vcf" - if not Path(output_genotypes_path).exists(): - # return - runstr2 = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - runstr2 += f" --variant {output_g_path}" - runstr2 += f" -allSites -o {output_genotypes_path}" - runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - os.system(runstr2) - - output_path = f"{tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz" - if not Path(output_path).exists(): - os.system(f"cat {output_genotypes_path} | {bcftools} annotate -a {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed.gz -c CHROM,-,POS,ID | {bgzip} -c > {output_path}") - - -def _gvcf_to_plink(plink, resultsdir, sample): - """ - Convert gvcf to plink files (bed, bim, fam) - - Input files: - - {tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz - - Output files: - - {tmp_dir}/{sample}.bed - - {tmp_dir}/{sample}.bim - - {tmp_dir}/{sample}.fam - - {tmp_dir}/{sample}.fam.mod - - {tmp_dir}/{sample}.log (update) - - {tmp_dir}/{sample}.nosex - - """ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}.fam" - if Path(output_path).exists(): - 
return - # convert gvcf to plink (generate bed, bim, fam) - runstr1q = f"{plink} --biallelic-only --vcf-require-gt" - runstr1q += f" --vcf {tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz" - runstr1q += f" --out {tmp_dir}/{sample}" - runstr1q += " --allow-extra-chr --make-bed --double-id" - subprocess.run(runstr1q, shell=True) - - # the vcf has the SN (sample name) from the BAM, so when converts that to plink it gives me problems if there is a "_" in the name. - # So I'm changin it and I put the same $sample name as FIID and IID. - # Also, if the IID is not the same as $sample, the R script will not find the sameple and will not be plotted - cmd = f"awk '{{ $1 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" - subprocess.run(cmd, shell=True) - cmd2 = f"awk '{{ $2 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" - subprocess.run(cmd2, shell=True) - - -def _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc): - """ - Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - - Four groups of files are generated: - - Input files: - - {tmp_dir}/{sample}.bed - - {tmp_dir}/{sample}.bim - - {tmp_dir}/{sample}.fam - - {tmp_dir}/{sample}.fam.mod - - initialAnc f"{dir}reference/1kGP_GRCh38_exome" - - Output files: - - {tmp_dir}/{sample}_1kGP_0 (-merge.missnp, -merge.fam, .log) - - {tmp_dir}/1kGP_2 (.bed, .bim, .fam, .log, .nosex) - - {tmp_dir}/{sample}_2 (.bed, .bim, .fam, .log, .nosex) - - {tmp_dir}/{sample}_1kGP (.bed, .bim, .fam, .log, .nosex) - - """ - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.fam" - # if Path(output_path).exists(): - # return - # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - output_path = f"{tmp_dir}/{sample}_1kGP_0-merge.fam" - if not Path(output_path).exists(): - runstr3 = f"{plink} --bfile {tmp_dir}/{sample} --bmerge {initialAnc} --out {tmp_dir}/{sample}_1kGP_0 --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3) - - # Remove discarded SNPs from the pgp and 1kGP file - output_path = f"{tmp_dir}/1kGP_2.bed" - if not Path(output_path).exists(): - runstr1z = f"{plink} --bfile {initialAnc} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/1kGP_2 --make-bed --allow-extra-chr" - os.system(runstr1z) - - output_path = f"{tmp_dir}/{sample}_2.bed" - if not Path(output_path).exists(): - runstr1z2 = f"{plink} --bfile {tmp_dir}/{sample} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/{sample}_2 --make-bed --allow-extra-chr" - os.system(runstr1z2) - - # Merge the pgp and 1kGP files again - output_path = f"{tmp_dir}/{sample}_1kGP.bed" - if not Path(output_path).exists(): - runstr3a = f"{plink} --bfile {tmp_dir}/{sample}_2 --bmerge {tmp_dir}/1kGP_2 --out {tmp_dir}/{sample}_1kGP --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3a) - - -def _subset_unlinked_snps(plink, resultsdir, sample): - """ - Choose a subset of filtered and independent (unlinked) SNPs - - Input files: - - {tmp_dir}/{sample}_1kGP* - - Output files: - - {tmp_dir}/snps_to_prune (.log, .prune.in, .prune.out, .nosex) - - {tmp_dir}/{sample}_1kGP_pruned (.bed, .bim, .fam, .log, .nosex) - - """ - tmp_dir = f"{resultsdir}/temp" - - # Choose a subset of filtered and independent (unlinked) SNPs - - # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals. 
- output_path = f"{tmp_dir}/snps_to_prune.prune.in" - if not Path(output_path).exists(): - runstr4 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/snps_to_prune --indep-pairwise 100 5 0.1" - os.system(runstr4) - - # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1 - output_path = f"{tmp_dir}/{sample}_1kGP_pruned.bed" - if not Path(output_path).exists(): - runstr5 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/{sample}_1kGP_pruned --extract {tmp_dir}/snps_to_prune.prune.in --make-bed --mind 0.1" - os.system(runstr5) - - -def ancestry_generator_from_BAM(bam, resultsdir="", numThreads=4, - dir_reference='reference', dir_software='software', workdir="", - ): - - dir = "" - if 'SINGULARITY_NAME' in os.environ: - dir = "/GenomeChronicler/" - - # resultsdir = dir - plink = f"{dir}software/plink" - gatk = f"{dir}software/GenomeAnalysisTK.jar" - ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - initialAnc = f"{dir}reference/1kGP_GRCh38_exome" - bcftools = "bcftools" - samtools = "samtools" - bgzip = "bgzip" - tabix = "tabix" - - hasCHRflag = 0 - numThreads = 4 - - # if not sys.argv: - # print("Please provide a single base recalibrated BAM file to this script") - # sys.exit(1) - - # bam = sys.argv[0] - sample = basename(bam) - sample = sample.rsplit(".", 1)[0] - sample = sample.replace(".recal", "") - sample = sample.replace(".bam.clean", "") - - # if len(sys.argv) > 1: - # resultsdir = sys.argv[1] - - # if len(sys.argv) > 2: - # numThreads = sys.argv[2] - - os.system(f"mkdir -p {resultsdir}/temp") - - # Also check for the need to samtools index this stuff - # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict - - # ------------------------------------------ - # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with - # -- only SNPs from exome (914577 total) - # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there - # -- will not be artifacts due to reporting SNPS i different stransd ib both datasets - # -- I also remove sexual chromosomes - # ------------------------------------------ - - # _get_list_of_snps_from_1kGP() - _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix) - - # ------------------------------------------ - # -- Convert BAM to a gvcf that contains only - # - the 1kgp snps filtered above - # ------------------------------------------ - - # _get_gvcf() - _get_gvcf(bam, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip) - - # ------------------------------------------ - # -- Convert GVCF data to PLINK format - # ------------------------------------------ - - # _gvcf_to_plink() - _gvcf_to_plink(plink, resultsdir, sample) - - # ------------------------------------------ - # -- Merge gvcf with 1kGP data. I need to merge the files twice, the first one will fail if there are SNPs with - # -- different alleles in both datasets. 
The second merge will exclude these SNPs - # ------------------------------------------ - - # _merge_pgp_1kGP() - _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc) - - # ------------------------------------------ - # -- Reduce the merged dataset further by removing SPs in LD - # ------------------------------------------ - - # _subset_unlinked_snps() - _subset_unlinked_snps(plink, resultsdir, sample) - # ------------------------------------------ - # -- Perform PCA - # ------------------------------------------ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned_pca_20.eigenvec" - if not Path(output_path).exists(): - runstr6 = f"{plink}" - runstr6 += f" --bfile {tmp_dir}/{sample}_1kGP_pruned" - runstr6 += f" --out {tmp_dir}/{sample}_1kGP_pruned_pca_20" - runstr6 += " --pca" - subprocess.run(runstr6, shell=True) - - -if __name__ == '__main__': - fire.Fire(ancestry_generator_from_BAM) \ No newline at end of file diff --git a/scripts/GenomeChronicler_ancestry_generator_fromVCF.py b/scripts/GenomeChronicler_ancestry_generator_fromVCF.py deleted file mode 100755 index 5554115..0000000 --- a/scripts/GenomeChronicler_ancestry_generator_fromVCF.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import subprocess -from os.path import basename - -from pathlib import Path -import fire - -# get list of SNPs from 1kGP -def _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix): - output_path = f"{resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed" - if Path(output_path).exists(): - return - with open(initialBIM, "r") as inf, open(output_path, "w") as outf: - outf.write("#CHROM\tST0\tPOS\tID\n") - for line in inf: - a = line.strip().split("\t") - if hasCHRflag == 1 and not a[0].startswith("chr"): - a[0] = "chr" + a[0] - pp = a[0] - pp = pp.replace('chr','') - if int(pp) > 22: - continue - if a[4] == 'T' and a[5] == 'A': - continue - if a[4] == 'A' and a[5] == 'T': - continue - if a[4] == 'C' and a[5] == 'G': - continue - if a[4] == 'G' and a[5] == 'C': - continue - outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") - os.system(f"{bgzip} -c {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed > {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - os.system(f"{tabix} -p bed {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - - -# convert BAM to a gvcf that contains only the 1kgp snps filtered above -def _get_gvcf(vcf, resultsdir, sample, bcftools, bgzip): - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.rsIDs.gvcf.gz" - # if Path(output_path).exists(): - # return - # runstr = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - # runstr += f" -T HaplotypeCaller -R {ref_hs38}" - # runstr += f" -I {bam} -nct {numThreads}" - # runstr += f" --emitRefConfidence GVCF -L {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed" - # runstr += f" -o {tmp_dir}/{sample}.g.vcf" - # os.system(runstr) - # runstr2 = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - # runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - # runstr2 += f" --variant {tmp_dir}/{sample}.g.vcf" - # runstr2 += f" -allSites -o {tmp_dir}/{sample}.genotypes.vcf" - # runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - # os.system(runstr2) - os.system(f"cat {vcf} | {bcftools} annotate -a {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed.gz -c CHROM,-,POS,ID | {bgzip} -c > {tmp_dir}/{sample}.rsIDs.gvcf.gz") - - -def _gvcf_to_plink(plink, resultsdir, sample): - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}.fam" - if 
Path(output_path).exists(): - return - # convert gvcf to plink - runstr1q = f"{plink} --biallelic-only --vcf-require-gt" - runstr1q += f" --vcf {tmp_dir}/{sample}.rsIDs.gvcf.gz" - runstr1q += f" --out {tmp_dir}/{sample}" - runstr1q += " --allow-extra-chr --make-bed --double-id" - subprocess.run(runstr1q, shell=True) - - # the vcf has the SN (sample name) from the BAM, so when converts that to plink it gives me problems if there is a "_" in the name. - # So I'm changin it and I put the same $sample name as FIID and IID. - # Also, if the IID is not the same as $sample, the R script will not find the sameple and will not be plotted - cmd = f"awk '{{ $1 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" - subprocess.run(cmd, shell=True) - cmd2 = f"awk '{{ $2 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" - subprocess.run(cmd2, shell=True) - -# def _gvcf_to_plink(plink, resultsdir, sample): -# tmp_dir = f"{resultsdir}/temp" -# # Convert gvcf to plink -# runstr1q = f"{plink} --biallelic-only --vcf-require-gt --vcf {tmp_dir}/{sample}.rsIDs.gvcf.gz --out {tmp_dir}/{sample} --allow-extra-chr --make-bed --double-id" -# os.system(runstr1q) - -# # Change sample names to avoid errors and inconsistencies -# cmd = f"awk '{{\$1 = \"{sample}\"; print}}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" -# os.system(cmd) -# cmd2 = f"awk '{{\$2 = \"{sample}\"; print}}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" -# os.system(cmd2) - - -def _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc): - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.fam" - # if Path(output_path).exists(): - # return - # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - runstr3 = f"{plink} --bfile {tmp_dir}/{sample} --bmerge {initialAnc} --out {tmp_dir}/{sample}_1kGP_0 --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3) - - # Remove discarded SNPs from the pgp and 1kGP file - runstr1z = f"{plink} --bfile {initialAnc} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/1kGP_2 --make-bed --allow-extra-chr" - os.system(runstr1z) - runstr1z2 = f"{plink} --bfile {tmp_dir}/{sample} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/{sample}_2 --make-bed --allow-extra-chr" - os.system(runstr1z2) - - # Merge the pgp and 1kGP files again - runstr3a = f"{plink} --bfile {tmp_dir}/{sample}_2 --bmerge {tmp_dir}/1kGP_2 --out {tmp_dir}/{sample}_1kGP --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3a) - - -def _subset_unlinked_snps(plink, resultsdir, sample): - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned.bed" - if Path(output_path).exists(): - return - - # Choose a subset of filtered and independent (unlinked) SNPs - - # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals. 
- runstr4 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/snps_to_prune --indep-pairwise 100 5 0.1" - os.system(runstr4) - - # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1 - runstr5 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/{sample}_1kGP_pruned --extract {tmp_dir}/snps_to_prune.prune.in --make-bed --mind 0.1" - os.system(runstr5) - - -def ancestry_generator_from_VCF(vcf, sample, resultsdir=""): - - dir = "" - if 'SINGULARITY_NAME' in os.environ: - dir = "/GenomeChronicler/" - - # resultsdir = dir - plink = f"{dir}software/plink" - # gatk = f"{dir}software/GenomeAnalysisTK.jar" - # ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - initialAnc = f"{dir}reference/1kGP_GRCh38_exome" - bcftools = "bcftools" - # samtools = "samtools" - bgzip = "bgzip" - tabix = "tabix" - - hasCHRflag = 0 - numThreads = 4 - - # if not sys.argv: - # print("Please provide a single base recalibrated VCF file to this script") - # sys.exit(1) - - # VCF = sys.argv[0] - # sample = basename(vcf) - # sample = sample.rsplit(".", 1)[0] - # sample = sample.replace(".recal", "") - # sample = sample.replace(".bam.clean", "") - - # if len(sys.argv) > 1: - # resultsdir = sys.argv[1] - - # if len(sys.argv) > 2: - # numThreads = sys.argv[2] - - os.system(f"mkdir -p {resultsdir}/temp") - - # Also check for the need to samtools index this stuff - # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict - - # ------------------------------------------ - # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with - # -- only SNPs from exome (914577 total) - # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there - # -- will not be artifacts due to reporting SNPS i different stransd ib both datasets - # -- I also remove sexual chromosomes - # ------------------------------------------ - - # _get_list_of_snps_from_1kGP() - _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix) - - # ------------------------------------------ - # -- Convert BAM to a gvcf that contains only - # - the 1kgp snps filtered above - # ------------------------------------------ - - # _get_gvcf() - # _get_gvcf(vcf, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip) - _get_gvcf(vcf, resultsdir, sample, bcftools, bgzip) - - # ------------------------------------------ - # -- Convert GVCF data to PLINK format - # ------------------------------------------ - - # _gvcf_to_plink() - _gvcf_to_plink(plink, resultsdir, sample) - - # ------------------------------------------ - # -- Merge gvcf with 1kGP data. I need to merge the files twice, the first one will fail if there are SNPs with - # -- different alleles in both datasets. 
The second merge will exclude these SNPs - # ------------------------------------------ - - # _merge_pgp_1kGP() - _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc) - # print('test') - # exit() - - # ------------------------------------------ - # -- Reduce the merged dataset further by removing SPs in LD - # ------------------------------------------ - - # _subset_unlinked_snps() - _subset_unlinked_snps(plink, resultsdir, sample) - # ------------------------------------------ - # -- Perform PCA - # ------------------------------------------ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned_pca_20.eigenvec" - if Path(output_path).exists(): - return - runstr6 = f"{plink}" - runstr6 += f" --bfile {tmp_dir}/{sample}_1kGP_pruned" - runstr6 += f" --out {tmp_dir}/{sample}_1kGP_pruned_pca_20" - runstr6 += " --pca" - subprocess.run(runstr6, shell=True) - - -if __name__ == '__main__': - fire.Fire(ancestry_generator_from_VCF) \ No newline at end of file diff --git a/scripts/GenomeChronicler_quickFilterFinalReportTables.py b/scripts/GenomeChronicler_quickFilterFinalReportTables.py deleted file mode 100755 index 1376c8a..0000000 --- a/scripts/GenomeChronicler_quickFilterFinalReportTables.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 - -from pathlib import Path - -import pandas as pd -import fire - - -def filter_final_report_csv(csv_input_path, csv_output_path=None, DEBUG=False): - ''' - - ''' - # read csv - df = pd.read_csv(csv_input_path,dtype={'Mag.':'str'}) - - # filter - if len(df.columns) <= 4: - if DEBUG: - print("WARNING: No links found") - return - df = df[~(df['Mag.'].astype('float')==0)] - df = df[~(df['Identifier'].str.startswith('{rs')|df['Identifier'].str.startswith('{i'))] - - # export - if csv_output_path is None: # inplace - # warning, inplace operation - csv_output_path = csv_input_path - Path(csv_output_path).parent.mkdir(parents=True, exist_ok=True) - df.to_csv(csv_output_path, index=False) - - -if __name__ == '__main__': - fire.Fire(filter_final_report_csv) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/afogeno_generator.py b/scripts/afogeno_generator.py new file mode 100755 index 0000000..9f01ebe --- /dev/null +++ b/scripts/afogeno_generator.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +import os +import subprocess +import sys +from pathlib import Path + +import fire + +if __name__ == '__main__': + sys.path.append(os.path.abspath(os.getcwd())) +from scripts.ancestry_generator import get_gvcf + + +def build_genotype(a1, a2, gen): + alleles = [a1] + a2.split(",") + extras = [""] * len(alleles) + l1 = len(alleles[0]) + extra = "" + for i in range(1, len(alleles)): + extras[i] = "" + l2 = len(alleles[i]) + if l1 > 1 or l2 > 1: + if l2 > l1: + alleles[0] = "-" + alleles[i] = alleles[i][1:] + extras[i] = "I" + else: + alleles[0] = alleles[0][1:] + alleles[i] = "-" + extras[i] = "D" + res = [alleles[int(i)] for i in gen.split("/")] + extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") + return res, extra + + +# def uniq(l): +# return list(set(l)) + +def prepare_snps_ref_bed(ref_input_path, output_dir, bgzip="bgzip", tabix="tabix"): + """ + Sort and compress BED file + - Sort by chromosome and position + - Compress with bgzip + - Index with tabix + + Input: .bed + Output: .bed.gz, .bed.gz.tbi + + Parameters + ---------- + ref_input_path + output_dir + bgzip + tabix + + Returns + ------- + + """ + ref_input_path = Path(ref_input_path) + output_dir = 
Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + ref_path = output_dir / f"{ref_input_path.name}" + gz_path = output_dir / f"{ref_input_path.name}.gz" + gz_tbi_path = output_dir / f"{ref_input_path.name}.gz.tbi" + + # Sort and compress BED + if not gz_path.exists(): + # subprocess.run(["sort", "-k", "1,1", "-k2,2n", str(ref_input_path), ">", str(ref_path)]) + # subprocess.run(["mv", "tempAAA", ref_path]) + # subprocess.run([bgzip, "-c", f"{ref_path}" + " > " + f"{gz_path}"]) + cmd = f"sort -k 1,1 -k2,2n {ref_input_path} > {ref_path}" + subprocess.run(cmd, shell=True) + cmd = f"bgzip -c {ref_path} > {gz_path}" + subprocess.run(cmd, shell=True) + + # Force tabix re-index + if not gz_tbi_path.exists(): + # tabix index + # subprocess.run([tabix, "-p", "bed", f"{gz_path}"]) + cmd = f"tabix -p bed {gz_path}" + subprocess.run(cmd, shell=True) + +def generate_afogeno_38(vcf_path, ref_bed, output_dir): + ## Generate AFO Geno 38 file + gen_data = {} + counter_template = {} + + # with gzip.open(ref_bed, "rt") as IN: + with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: + for line in IN: + line = line.strip() + if not line: + continue + + # data = line.split(b"\t") + data = line.decode().split("\t") + # gen_data.setdefault(data[0], {})[int(data[2])] = data + gen_data[(data[0], data[2])] = data + counter_template[(data[0], data[2])] = 0 + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + # output_vcf_path = output_dir/f"afogeno.genotypes.rsIDs.vcf.gz" + afogeno_path = output_dir/f"afogeno38.txt" + debugCounter = {} + + with subprocess.Popen( + f'gzip -dcf {vcf_path} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ + open(afogeno_path, "w") as out_file: + # for line_bytes in proc1.stdout: + for line_bytes in proc1.stdout: + line = line_bytes.decode().rstrip() + + if not line or line.startswith("#"): + continue + + line = line.replace(",", "") + data = line.split("\t") + + tmp = data[4].split(":") + data[4] = tmp[0] + + genotype = ["NA", "NA"] + extra = "NA" + + pos = (data[0], data[1]) + + if data[4] != "./.": + a1, a2 = data[2], data[3] + genotype, extra = build_genotype(a1, a2, data[4]) + else: + # No reads there + debugCounter[pos] = debugCounter.get(pos, 0) + 1 + continue + + data.extend(genotype) + data.append(extra) + + # pos = (data[0], data[1]) + if pos in gen_data: + d2 = gen_data[pos] + out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" + out_file.write(out_data) + else: + debugCounter[pos] = debugCounter.get(pos, 0) + 1 + + +def generate_afogeno_pipeline(bam_or_vcf, output_dir="", numThreads=4, + dir_reference='reference', dir_software='software', workdir="", + ): + if 'SINGULARITY_NAME' in os.environ: + workdir = "/GenomeChronicler/" + workdir = Path(workdir) + + # resultsdir = workdir + ref_bed = workdir / f"{dir_reference}/snps.19-114.unique.nochr.bed" + gatk = workdir / f"{dir_software}/GenomeAnalysisTK.jar" + ref_hs38 = workdir / f"{dir_reference}/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" + bcftools = "bcftools" + bgzip = "bgzip" + tabix = "tabix" + + output_dir = Path(output_dir) + plink_snps_dir = output_dir / 'prepare_snps_ref_bed' + prepare_snps_ref_bed(ref_bed, plink_snps_dir, bgzip, tabix) + + ref_snps = plink_snps_dir/'snps.19-114.unique.nochr.bed' + vcf_dir = output_dir/'vcf_afogeno' + get_gvcf(bam_or_vcf, vcf_dir, ref_hs38, ref_snps, 'afogeno', gatk, numThreads, bcftools, bgzip) + + 
## Generate AFO Geno 38 file + vcf_afogeno_path = vcf_dir / f'afogeno.rsIDs.gvcf.gz' + generate_afogeno_38(vcf_afogeno_path, ref_bed, output_dir) + + +funcs = { + 'prepare_snps_ref_bed': prepare_snps_ref_bed, + 'generate_afogeno_38': generate_afogeno_38, + 'generate_afogeno_pipeline': generate_afogeno_pipeline, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/ancestry_generator.py b/scripts/ancestry_generator.py new file mode 100755 index 0000000..e4cc1dd --- /dev/null +++ b/scripts/ancestry_generator.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 + +import os +import subprocess +from pathlib import Path + +import fire + + +def get_list_of_snps_from_1kGP(initialBIM, output_dir, hasCHRflag=0, bgzip='bgzip', tabix='tabix'): + """ + Get list of SNPs from 1kGP + + Input files: + - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" + Output files: + - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz" + - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz.tbi" + + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir/f"1kGP_GRCh38_nonAT_CG.bed" + if Path(output_path).exists(): + return + with open(initialBIM, "r") as inf, open(output_path, "w") as outf: + outf.write("#CHROM\tST0\tPOS\tID\n") + for line in inf: + a = line.strip().split("\t") + if hasCHRflag == 1 and not a[0].startswith("chr"): + a[0] = "chr" + a[0] + pp = a[0] + pp = pp.replace('chr','') + if int(pp) > 22: + continue + if a[4] == 'T' and a[5] == 'A': + continue + if a[4] == 'A' and a[5] == 'T': + continue + if a[4] == 'C' and a[5] == 'G': + continue + if a[4] == 'G' and a[5] == 'C': + continue + outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") + os.system(f"{bgzip} -c {output_dir}/1kGP_GRCh38_nonAT_CG.bed > {output_dir}/1kGP_GRCh38_nonAT_CG.bed.gz") + os.system(f"{tabix} -p bed {output_dir}/1kGP_GRCh38_nonAT_CG.bed.gz") + + +def get_gvcf(vcf_or_bam, output_dir, ref_hs38, ref_1kGP, name='ancestry', + gatk='software/GenomeAnalysisTK.jar', numThreads=4, bcftools='bcftools', bgzip='bgzip'): + """ + Convert BAM to a gvcf that contains only the 1kgp snps filtered above + + Input files: + - f"{dir}bam/clean.bam" + + - {ref_hs38} reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa + - {ref_1kGP} {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed + + Output files: + - f"{dir}temp/g.vcf" + - f"{dir}temp/genotypes.vcf" + - f"{dir}temp/rsIDs.gvcf.gz" + + """ + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + + ref_1kGP = Path(ref_1kGP) + ref_1kGP_gz = f"{ref_1kGP}.gz" + + if str(vcf_or_bam).lower().endswith('.bam'): + # Define the command to run GATK HaplotypeCaller + output_g_path = f"{output_dir}/{name}.g.vcf" + if not Path(output_g_path).exists(): + runstr = f"java -Xmx40g -Djava.io.tmpdir={output_dir}/tmpdir -jar {gatk}" + runstr += f" -T HaplotypeCaller -R {ref_hs38}" + runstr += f" -I {vcf_or_bam} -nct {numThreads}" + runstr += f" --emitRefConfidence GVCF -L {ref_1kGP}" + runstr += f" -o {output_g_path}" + os.system(runstr) + + # Define the command to run GATK GenotypeGVCFs + output_genotypes_path = f"{output_dir}/{name}.genotypes.vcf" + if not Path(output_genotypes_path).exists(): + # return + runstr2 = f"java -Xmx40g -Djava.io.tmpdir={output_dir}/tmpdir -jar {gatk}" + runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" + runstr2 += f" --variant {output_g_path}" + runstr2 += f" -allSites -o 
{output_genotypes_path}"
+        runstr2 += " -stand_emit_conf 10 -stand_call_conf 30"
+        os.system(runstr2)
+    else:
+        output_genotypes_path = vcf_or_bam
+
+    # Add rsIDs to the VCF file using bcftools and bgzip
+    output_path = f"{output_dir}/{name}.rsIDs.gvcf.gz"
+    if not Path(output_path).exists():
+        cmd = f"cat {output_genotypes_path} | {bcftools} annotate -a {ref_1kGP_gz} -c CHROM,-,POS,ID | {bgzip} -c > {output_path}"
+        ret = subprocess.run(cmd, shell=True)
+
+
+def gvcf_to_plink(gvcf_path, output_dir, output_name='sample', plink='software/plink'):
+    """
+    Convert a gVCF to PLINK files (bed, bim, fam)
+
+    Input files:
+    - {output_dir}/{sample}.ancestry.rsIDs.gvcf.gz
+
+    Output files:
+    - {output_dir}/{sample}.bed
+    - {output_dir}/{sample}.bim
+    - {output_dir}/{sample}.fam
+    - {output_dir}/{sample}.fam.mod
+    - {output_dir}/{sample}.log (update)
+    - {output_dir}/{sample}.nosex
+
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    output_path = f"{output_dir}/{output_name}.fam"
+    if Path(output_path).exists():
+        return
+
+    # convert gvcf to plink (generate bed, bim, fam)
+    runstr1q = f"{plink} --biallelic-only --vcf-require-gt"
+    runstr1q += f" --vcf {gvcf_path}"
+    runstr1q += f" --out {output_dir}/{output_name}"
+    runstr1q += " --allow-extra-chr --make-bed --double-id"
+    subprocess.run(runstr1q, shell=True)
+
+    # The VCF carries the SN (sample name) from the BAM, and when PLINK converts it, a "_" in the name causes problems.
+    # So I'm changing it and putting the same $sample name as both FID and IID.
+    # Also, if the IID is not the same as $sample, the R script will not find the sample and it will not be plotted.
+    cmd = f"awk '{{ $1 = \"{output_name}\"; print }}' {output_dir}/{output_name}.fam > {output_dir}/{output_name}.fam.mod"
+    subprocess.run(cmd, shell=True)
+    cmd2 = f"awk '{{ $2 = \"{output_name}\"; print }}' {output_dir}/{output_name}.fam.mod > {output_dir}/{output_name}.fam"
+    subprocess.run(cmd2, shell=True)
+
+
+def merge_pgp_1kGP(plink_result_dir, output_dir, ref_initialAnc='reference/1kGP_GRCh38_exome', plink='software/plink'):
+    """
+    Merge the pgp and 1kGP files and get the SNPs that disagree between the two datasets
+    - Four groups of files are generated (listed under Output files below)
+
+    Input files:
+    - {plink_result_dir}/{sample}.bed
+    - {plink_result_dir}/{sample}.bim
+    - {plink_result_dir}/{sample}.fam
+    - {plink_result_dir}/{sample}.fam.mod
+    - initialAnc f"{dir}reference/1kGP_GRCh38_exome"
+
+    Output files:
+    - {resultsdir}/{sample}_1kGP_0 (-merge.missnp, -merge.fam, .log)
+    - {resultsdir}/1kGP_2 (.bed, .bim, .fam, .log, .nosex)
+    - {resultsdir}/{sample}_2 (.bed, .bim, .fam, .log, .nosex)
+    - {resultsdir}/{sample}_1kGP (.bed, .bim, .fam, .log, .nosex)
+
+    """
+    # resultsdir = f"{resultsdir}/temp"
+    # output_path = f"{resultsdir}/{sample}.fam"
+    # if Path(output_path).exists():
+    #     return
+    # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    output_path = f"{output_dir}/1kGP_0-merge.fam"
+    if not Path(output_path).exists():
+        runstr3 = f"{plink} --bfile {plink_result_dir} --bmerge {ref_initialAnc} --out {output_dir}/1kGP_0 --geno 0 --allow-extra-chr --make-bed"
+        os.system(runstr3)
+
+    # Remove discarded SNPs from the pgp and 1kGP file
+    output_path = f"{output_dir}/1kGP_2.bed"
+    if not Path(output_path).exists():
+        runstr1z = f"{plink} --bfile {ref_initialAnc} --exclude {output_dir}/1kGP_0-merge.missnp --out {output_dir}/1kGP_2 --make-bed --allow-extra-chr"
+        os.system(runstr1z)
+
+    output_path = f"{output_dir}/2.bed"
+    if not Path(output_path).exists():
+        runstr1z2 = f"{plink} --bfile {plink_result_dir} --exclude {output_dir}/1kGP_0-merge.missnp --out {output_dir}/2 --make-bed --allow-extra-chr"
+        os.system(runstr1z2)
+
+    # Merge the pgp and 1kGP files again
+    output_path = f"{output_dir}/1kGP.bed"
+    if not Path(output_path).exists():
+        runstr3a = f"{plink} --bfile {output_dir}/2 --bmerge {output_dir}/1kGP_2 --out {output_dir}/1kGP --geno 0 --allow-extra-chr --make-bed"
+        os.system(runstr3a)
+
+
+def subset_unlinked_snps(plink_1kgp_path, output_dir, plink='software/plink'):
+    """
+    Choose a subset of filtered and independent (unlinked) SNPs
+
+    Input files:
+    - {output_dir}/{sample}_1kGP* (.bed, .bim, .fam, .log, .nosex)
+
+    Output files:
+    - {output_dir}/snps_to_prune (.log, .prune.in, .prune.out, .nosex)
+    - {output_dir}/{sample}_1kGP_pruned (.bed, .bim, .fam, .log, .nosex)
+
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # output_dir = f"{resultsdir}/temp"
+
+    # Choose a subset of filtered and independent (unlinked) SNPs
+
+    # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals.
+    output_path = f"{output_dir}/snps_to_prune.prune.in"
+    if not Path(output_path).exists():
+        runstr4 = f"{plink} --bfile {plink_1kgp_path} --out {output_dir}/snps_to_prune --indep-pairwise 100 5 0.1"
+        os.system(runstr4)
+
+    # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1
+    output_path = f"{output_dir}/1kGP_pruned.bed"
+    if not Path(output_path).exists():
+        runstr5 = f"{plink} --bfile {plink_1kgp_path} --out {output_dir}/1kGP_pruned --extract {output_dir}/snps_to_prune.prune.in --make-bed --mind 0.1"
+        os.system(runstr5)
+
+
+def from_1kgp_pruned_to_pca(plink_1kgp_pruned, output_dir, plink='software/plink'):
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    output_path = f"{output_dir}/1kGP_pruned_pca_20.eigenvec"
+    if not Path(output_path).exists():
+        runstr6 = f"{plink}"
+        runstr6 += f" --bfile {plink_1kgp_pruned}"
+        runstr6 += f" --out {output_dir}/1kGP_pruned_pca_20"
+        runstr6 += " --pca"
+        subprocess.run(runstr6, shell=True)
+
+
+def generate_ancestry_pipeline(bam_or_vcf, output_dir, numThreads=4, sample='sample',
+                               dir_reference='reference', dir_software='software', workdir="",
+                               ):
+    """
+    Generate the ancestry data from a BAM or gVCF file
+
+    Steps:
+    1. Preprocess the 1kGP data
+    2. Convert the BAM to a gVCF
+    3. Convert the gVCF to PLINK format
+    4. Merge the pgp and 1kGP files and get the SNPs that disagree between the datasets
+    5. Choose a subset of filtered and independent (unlinked) SNPs
+    6. Run PCA on the merged, pruned dataset
+
+
+    Parameters
+    ----------
+    bam_or_vcf
+    output_dir
+    numThreads
+    sample
+    dir_reference
+    dir_software
+    workdir
+
+    Returns
+    -------
+
+    """
+
+    dir = ""
+    if 'SINGULARITY_NAME' in os.environ:
+        dir = "/GenomeChronicler/"
+
+    # resultsdir = dir
+    plink = f"{dir}software/plink"
+    gatk = f"{dir}software/GenomeAnalysisTK.jar"
+    ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa"
+    initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim"
+    initialAnc = f"{dir}reference/1kGP_GRCh38_exome"
+    bcftools = "bcftools"
+    bgzip = "bgzip"
+    tabix = "tabix"
+    hasCHRflag = 0
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # Also check for the need to samtools index this stuff
+    # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict
+
+    # ------------------------------------------
+    # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with
+    # -- only SNPs from the exome (914577 total)
+    # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there
+    # -- will not be artifacts due to SNPs being reported on different strands in the two datasets
+    # -- I also remove the sex chromosomes
+    # ------------------------------------------
+    plink_1kgp_dir = output_dir/'1kGP_GRCh38_nonAT_CG'
+    get_list_of_snps_from_1kGP(initialBIM, plink_1kgp_dir, hasCHRflag=hasCHRflag, bgzip=bgzip, tabix=tabix)
+
+    # ------------------------------------------
+    # -- Convert the BAM to a gvcf that contains only
+    # -- the 1kGP SNPs filtered above
+    # ------------------------------------------
+    ref_1kGP = plink_1kgp_dir/'1kGP_GRCh38_nonAT_CG.bed'
+    vcf_dir = output_dir/'vcf_ancestry'
+    get_gvcf(bam_or_vcf, vcf_dir, ref_hs38, ref_1kGP, 'ancestry', gatk, numThreads, bcftools, bgzip)
+
+    # ------------------------------------------
+    # -- Convert GVCF data to PLINK format
+    # ------------------------------------------
+    gvcf_path = vcf_dir/f"ancestry.rsIDs.gvcf.gz"
+    gvcf_to_plink(gvcf_path, vcf_dir, sample, plink)
+
+    # ------------------------------------------
+    # -- Merge the gvcf with the 1kGP data. The files need to be merged twice: the first merge
+    # -- will fail if there are SNPs with different alleles in the two datasets.
+    # -- The second merge will exclude these SNPs
+    # ------------------------------------------
+    plink_result_dir = vcf_dir/sample
+    merge_pgp_1kgp_dir = output_dir/'merged_pgp_1kGP'
+    merge_pgp_1kGP(plink_result_dir, merge_pgp_1kgp_dir, initialAnc, plink)
+
+    # ------------------------------------------
+    # -- Reduce the merged dataset further by removing SNPs in LD
+    # ------------------------------------------
+    merge_pgp_1kgp_path = merge_pgp_1kgp_dir/f"1kGP"
+    subset_unlinked_snps_dir = output_dir/'subset_unlinked_snps'
+    subset_unlinked_snps(merge_pgp_1kgp_path, subset_unlinked_snps_dir, plink)
+
+    # ------------------------------------------
+    # -- Perform PCA
+    # ------------------------------------------
+    plink_result_dir = subset_unlinked_snps_dir/f"1kGP_pruned"
+    pca_dir = output_dir/'pca_ancestry'
+    from_1kgp_pruned_to_pca(plink_result_dir, pca_dir, plink)
+
+
+funcs = {
+    'get_list_of_snps_from_1kGP': get_list_of_snps_from_1kGP,
+    'get_gvcf': get_gvcf,
+    'gvcf_to_plink': gvcf_to_plink,
+    'merge_pgp_1kGP': merge_pgp_1kGP,
+    'subset_unlinked_snps': subset_unlinked_snps,
+    'from_1kgp_pruned_to_pca': from_1kgp_pruned_to_pca,
+    'generate_ancestry_pipeline': generate_ancestry_pipeline,
+}
+
+
+class Tools(object):
+    def __init__(self):
+        super(Tools, self).__init__()
+
+
+if __name__ == '__main__':
+    for k, v in funcs.items():
+        setattr(Tools, k, staticmethod(v))
+    fire.Fire(Tools)
diff --git a/scripts/GenomeChronicler_genoTables_fromAfoGeno.py b/scripts/genotables_from_afogeno.py
similarity index 80%
rename from scripts/GenomeChronicler_genoTables_fromAfoGeno.py
rename to scripts/genotables_from_afogeno.py
index c55efbd..2de75b5 100755
--- a/scripts/GenomeChronicler_genoTables_fromAfoGeno.py
+++ b/scripts/genotables_from_afogeno.py
@@ -8,11 +8,11 @@
 import pandas as pd
 from tqdm import tqdm
 
-def ucfirst(s):
-    if len(s) > 0:
-        return s[0].upper() + s[1:]
-    else:
-        return s
+if __name__ == '__main__':
+    import sys
+    sys.path.append(os.path.abspath(os.getcwd()))
+from scripts.utils import ucfirst
+
 
 def processGenoset(genoset, genosets, genotypes, positiveGenosets):
     # example: logic = ((( $v[0] + $v[1] + $v[2] + $v[3] + $v[4] ) >= 1) and (!$v[5] and !$v[6] and !$v[7] and !$v[8] and !$v[9]))
@@ -140,40 +140,62 @@ def generate_ClinVar_url(rsid, sth5, sth5_sql):
         return ""
 
 
-def geno_tables_from_afogeno(filename, outdir='./', verbose=False):
-    version = "22-146"
+def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference"):
+    """
+    This function generates the geno tables from the afogeno output.
+ + Inputs: + afogeno_path: path to the afogeno output + reference_dir: path to the reference directory + snpedia.db + gnomad.db + getevidence.db + clinvar.db + genosetDependencies.txt + parsedGenosets.txt + Outputs: + latest.good.reportTable.csv" + latest.bad.reportTable.csv + latest.genoset.reportTable.csv + + Parameters + ---------- + afogeno_path + output_dir + verbose + version + reference_dir + + Returns + ------- + + """ + # dir = "" + # if "SINGULARITY_NAME" in os.environ: + # dir = "/GenomeChronicler/" + + reference_dir = Path(reference_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + temp_dir = output_dir/"temp" + temp_dir.mkdir(parents=True, exist_ok=True) # Note: It is quite clear this code can be optimised tmpBlacklist = {"rs10156191": 1, "rs12094543": 1, "rs1667255": 1, "rs17672135": 1, "rs6445975": 1, "rs7659604": 1} print(f"This is script [{__file__}] part of the report generator pipeline version [{version}];\n") - Path(outdir).mkdir(parents=True, exist_ok=True) - # Sort out input - # if len(sys.argv) != 3: - # print("Usage: {0} MassagedGenotypeVCF OutputDirectory".format(sys.argv[0])) - # sys.exit(1) - - # filename = sys.argv[1] - # outdir = "./" - # if len(sys.argv) > 2: - # outdir = sys.argv[2] - - dir = "" - if "SINGULARITY_NAME" in os.environ: - dir = "/GenomeChronicler/" - # Connect to databases - snpedia_db = sqlite3.connect(f"{dir}reference/snpedia.db") + snpedia_db = sqlite3.connect(reference_dir/f"snpedia.db") print("Opened database successfully [SNPedia]") - gnomad_db = sqlite3.connect(f"{dir}reference/gnomad.db") + gnomad_db = sqlite3.connect(reference_dir/f"gnomad.db") print("Opened database successfully [GnomAD]") - getevidence_db = sqlite3.connect(f"{dir}reference/getevidence.db") + getevidence_db = sqlite3.connect(reference_dir/f"getevidence.db") print("Opened database successfully [GetEvidence]") - clinvar_db = sqlite3.connect(f"{dir}reference/clinvar.db") + clinvar_db = sqlite3.connect(reference_dir/f"clinvar.db") print("Opened database successfully [ClinVar]") # Prepare statements @@ -204,7 +226,7 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): # Input Genoset Data allGenosets = [] - with open(f"{dir}reference/genosetDependencies.txt", "r") as genoset_deps_file: + with open(reference_dir/f"genosetDependencies.txt", "r") as genoset_deps_file: for line in genoset_deps_file: line = line.strip() if not line: @@ -213,7 +235,7 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): genosets = {} genotypes = {} - with open(f"{dir}reference/parsedGenosets.txt", "r") as parsed_genosets_file: + with open(reference_dir/f"parsedGenosets.txt", "r") as parsed_genosets_file: for line in parsed_genosets_file: line = line.strip() if not line: @@ -225,15 +247,15 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): # output_path = f'{outdir}/latest.good.reportTable.csv' # if not Path(output_path).exists(): if True: - IN = open(filename) + IN = open(afogeno_path) rows_GOOD = [] rows_BAD = [] - Path(f"{outdir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - Path(f"{outdir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {outdir}/latest.good.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> 
{outdir}/latest.bad.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", + shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", + shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) for line in tqdm(IN, disable=not verbose): # for line in IN: @@ -388,10 +410,10 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): print("\t +++ POSITIVES:\n", positiveGenosets) - with open(f"{outdir}/latest.genoset.reportTable.csv", "w") as geno: + with open(f"{output_dir}/latest.genoset.reportTable.csv", "w") as geno: geno.write("Magnitude,Identifier,Summary\n") - sort_process = subprocess.Popen(f"sort -t \",\" -k 1,1nr >> {outdir}/latest.genoset.reportTable.csv", stdin=subprocess.PIPE, shell=True) + sort_process = subprocess.Popen(f"sort -t \",\" -k 1,1nr >> {output_dir}/latest.genoset.reportTable.csv", stdin=subprocess.PIPE, shell=True) # proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {outdir}/latest.bad.reportTable.csv", # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) for geno in sorted(positiveGenosets.keys()): @@ -409,6 +431,46 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): print("Operation done successfully") snpedia_db.close() + filter_final_report_csv(f"{temp_dir}/latest.good.reportTable.csv", f"{output_dir}/latest.good.reportTable.csv") + filter_final_report_csv(f"{temp_dir}/latest.bad.reportTable.csv", f"{output_dir}/latest.bad.reportTable.csv") + + +def filter_final_report_csv(csv_input_path, csv_output_path=None, DEBUG=False): + ''' + + ''' + # read csv + df = pd.read_csv(csv_input_path,dtype={'Mag.':'str'}) + + # filter + if len(df.columns) <= 4: + if DEBUG: + print("WARNING: No links found") + return + df = df[~(df['Mag.'].astype('float')==0)] + df = df[~(df['Identifier'].str.startswith('{rs')|df['Identifier'].str.startswith('{i'))] + + # export + if csv_output_path is None: # inplace + # warning, inplace operation + csv_output_path = csv_input_path + Path(csv_output_path).parent.mkdir(parents=True, exist_ok=True) + df.to_csv(csv_output_path, index=False) + + + +funcs = { + 'geno_tables_from_afogeno': geno_tables_from_afogeno, + 'filter_final_report_csv': filter_final_report_csv, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + if __name__ == '__main__': - fire.Fire(geno_tables_from_afogeno) + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/GenomeChronicler_XLSX_fromTables.py b/scripts/io.py similarity index 51% rename from scripts/GenomeChronicler_XLSX_fromTables.py rename to scripts/io.py index 40365bf..50b0428 100644 --- a/scripts/GenomeChronicler_XLSX_fromTables.py +++ b/scripts/io.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -# import sys -# import os import csv +import os +import shutil +import subprocess from pathlib import Path -# import pandas as pd import fire import xlsxwriter def export_genotypes_xlsx_from_csvs(csv_dir, xlsx_output_path, - csv_path_good='latest.good.reportTable.csv', + 
csv_path_good='latest.good.reportTable.csv', csv_path_bad='latest.bad.reportTable.csv', csv_path_genoset='latest.genoset.reportTable.csv', ): @@ -101,5 +101,105 @@ def export_genotypes_xlsx_from_csvs(csv_dir, xlsx_output_path, workbook.close() +def render_latex(tables_dir, sample_name, output_dir, + plot_path=None, vep_dir=None, + template_dir="templates", latex_template=None, + ): + """ + Render the latex template with the tables and the version table + + Template + - + + Latex render files: + Common: + latexTemplate.tex + versionTable.txt + GeneStructure.pdf + + Personal: + SampleName.txt + latest.good.reportTable.csv + latest.bad.reportTable.csv + latest.genoset.reportTable.csv + + with VEP: + latest.summary.csv + latest.polyphen_table.csv + latest.var_class_table.csv + latest.var_cons_table.csv + + Parameters + ---------- + tables_dir + sample_name + output_dir + + Returns + ------- + + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + template_dir = Path(template_dir) + + if latex_template is None: + if vep_dir is not None: + latex_template = template_dir/f"reportTemplate_withVEP.tex" + else: + latex_template = template_dir/f"reportTemplate_ohneVEP.tex" + + # Prepare files for latex rendering + ## Main file + latex_main_name = f"{sample_name}_report" + shutil.copy(latex_template, f"{output_dir}/{latex_main_name}.tex") + + ## Common files + shutil.copy(template_dir/f"versionTable.txt", output_dir) + shutil.copy(template_dir/f"GeneStructure.pdf", output_dir) + + ## Personal files + with open(f"{output_dir}/SampleName.txt", "w") as f: + f.write(sample_name) + tables_dir = Path(tables_dir) + shutil.copy(tables_dir/f"latest.good.reportTable.csv", output_dir) + shutil.copy(tables_dir/f"latest.bad.reportTable.csv", output_dir) + shutil.copy(tables_dir/f"latest.genoset.reportTable.csv", output_dir) + if plot_path is not None: + shutil.copy(plot_path, output_dir/'AncestryPlot.pdf') + + ## VEP files + if vep_dir is not None: + vep_dir = Path(vep_dir) + shutil.copy(vep_dir/f"latest.summary.csv", output_dir) + shutil.copy(vep_dir/f"latest.polyphen_table.csv", output_dir) + shutil.copy(vep_dir/f"latest.var_class_table.csv", output_dir) + shutil.copy(vep_dir/f"latest.var_cons_table.csv", output_dir) + + + # Render latex + cwd = os.getcwd() + os.chdir(f"{output_dir}") + for _ in range(3): + cmd = f"pdflatex -interaction=nonstopmode {latex_main_name}.tex 2> /dev/null > /dev/null" + ret = subprocess.run(cmd, shell=True) + os.chdir(cwd) + + +funcs = { + 'export_genotypes_xlsx_from_csvs': export_genotypes_xlsx_from_csvs, + 'render_latex': render_latex, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + if __name__ == '__main__': - fire.Fire(export_genotypes_xlsx_from_csvs) + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) + + diff --git a/scripts/GenomeChronicler_plot_generator_fromAncestry.R b/scripts/plot_generator_fromAncestry.R similarity index 97% rename from scripts/GenomeChronicler_plot_generator_fromAncestry.R rename to scripts/plot_generator_fromAncestry.R index b60ebae..1fcb719 100755 --- a/scripts/GenomeChronicler_plot_generator_fromAncestry.R +++ b/scripts/plot_generator_fromAncestry.R @@ -1,11 +1,11 @@ library(RColorBrewer) library(TeachingDemos) -sample <- as.character(Sys.getenv("SAMPLE")) +pca_path <- as.character(Sys.getenv("PCA_PATH")) id <- as.character(Sys.getenv("ID")) -dir <- as.character(Sys.getenv("DIR")) +output_dir <- as.character(Sys.getenv("OUTPUT_DIR")) -data <- 
read.table(paste0(dir,"/results/results_",sample,"/temp/",sample, "_1kGP_pruned_pca_20.eigenvec"), header=F) +data <- read.table(pca_path, header=F) qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] mypalette = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) @@ -156,7 +156,7 @@ for(i in 1:length(zoom_data[,1])){ } #pdf(paste0("./results/results_",sample,"/",sample, "_ancestry_pca.pdf")) -pdf(paste0(dir,"/results/results_",sample,"/AncestryPlot.pdf")) +pdf(paste0(output_dir,"/AncestryPlot.pdf")) plot(data[, 3] + 0.5 * data[, 4] - 0.5 * data[, 5], sin(pi / 3) * data[,4] + sin(pi / 3) * data[, 5], col = all.colors, pch = 19, cex = 0.3, xaxt='n', yaxt='n', xlab="", ylab="", main=paste0("Ancestry ",id)) points(data[pgp, 3] + 0.5 * data[pgp, 4] - 0.5 * data[pgp, 5], sin(pi / 3) * data[pgp,4] + sin(pi / 3) * data[pgp, 5], col = "black", pch = 16, cex = 1) @@ -181,5 +181,3 @@ cexs <- c(rep(0.5, nrow(zoom_data) - 2), 1.2, 1.2) subplot(plot(zoom_data[, 3] + 0.5 * zoom_data[, 4] - 0.5 * zoom_data[, 5], sin(pi / 3) * zoom_data[,4] + sin(pi / 3) * zoom_data[, 5], col = zoom.colors, pch = pchs, cex = cexs, xlab="", ylab="", xaxt='n', yaxt='n', xlim=c(bottom_x, top_x), ylim=c(bottom_y,top_y)), x = "topleft", size = c(1.7,1.7)) dev.off() - - diff --git a/scripts/GenomeChronicler_plot_generator_fromAncestry.py b/scripts/plot_generator_fromAncestry.py similarity index 99% rename from scripts/GenomeChronicler_plot_generator_fromAncestry.py rename to scripts/plot_generator_fromAncestry.py index 6309273..2f254eb 100755 --- a/scripts/GenomeChronicler_plot_generator_fromAncestry.py +++ b/scripts/plot_generator_fromAncestry.py @@ -1,11 +1,9 @@ -import os -import re from pathlib import Path import fire +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt import seaborn as sns from matplotlib import patches diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..3176ca8 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,65 @@ +import subprocess +from pathlib import Path + +import fire + + +def clean_bam_file_noCHR(bam_path, output_dir=None): + """ + Reheader the BAM file and create an index + - Remove the "chr" prefix from the chromosome names + + Input: .bam + Output: .clean.bam, .clean.bam.bai + + Parameters + ---------- + bam_path: str or Path + BAM file path + output_dir: str, Path or None + - If None, the output file will be saved in the same directory as the input file + - Else, the output file will be saved in the specified directory + + Returns + ------- + None + + """ + # Reheader the BAM file and create an index + bam_path = Path(bam_path) + if output_dir is None: + output_dir = bam_path.parent + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir/ f"{bam_path.stem}.clean.BAM" + subprocess.run( + f"samtools reheader -c 'perl -pe \"s/^(@SQ.*)(\tSN:)chr/\$1\$2/\"' {bam_path} > {output_path}", + shell=True) + + print(f"\t +++ INFO: Indexing the BAM file") + subprocess.run(f"samtools index -@ 6 {output_path}", shell=True) + + + +funcs = { + 'clean_bam_file_noCHR': clean_bam_file_noCHR, +} + + +class UtilsTools(object): + def __init__(self): + super(UtilsTools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(UtilsTools, k, staticmethod(v)) + fire.Fire(UtilsTools) + + +def ucfirst(s): + if len(s) > 0: + return s[0].upper() + s[1:] + else: + return s diff --git 
a/scripts/GenomeChronicler_vepTables_fromVEP.py b/scripts/vep_tables_from_vep.py similarity index 80% rename from scripts/GenomeChronicler_vepTables_fromVEP.py rename to scripts/vep_tables_from_vep.py index e3e11fa..d1ef4f2 100755 --- a/scripts/GenomeChronicler_vepTables_fromVEP.py +++ b/scripts/vep_tables_from_vep.py @@ -1,39 +1,29 @@ #!/usr/bin/env python3 - -import subprocess -import sys +import os import re +import subprocess from pathlib import Path import fire -def ucfirst(s): - if len(s) > 0: - return s[0].upper() + s[1:] - else: - return s - -def get_vepTables_from_VEP(filename, out_dir="."): +if __name__ == '__main__': + import sys + sys.path.append(os.path.abspath(os.getcwd())) - # # Sort out input - # if len(sys.argv) != 3: - # sys.exit("Usage: {} VEPhtml OutputDir".format(sys.argv[0])) +from scripts.utils import ucfirst - # filename = sys.argv[1] - # out_dir = sys.argv[2] - # if not os.path.exists(filename): - # sys.exit("Major error here (File not found)") +def get_vep_tables_from_vep(vep_path, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) - out_file = open("{}/latest.summary.csv".format(out_dir), "w") + out_file = open(f"{output_dir}/latest.summary.csv", "w") out_file.write("Feature\tCount\n") recording = 0 - # with (filename, "r") as in_file: - with subprocess.Popen(f'cat {filename} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc: + with subprocess.Popen(f'cat {vep_path} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc: for line in proc.stdout: line = line.decode() - # line = line.strip() line = line.rstrip("\n") if not line: continue @@ -70,7 +60,7 @@ def get_vepTables_from_VEP(filename, out_dir="."): # in_file.close() out_file.close() - with open(filename, "r") as in_file: + with open(vep_path, "r") as in_file: for line in in_file: line = line.strip() if not line or "drawTable" not in line or "[" not in line: @@ -97,7 +87,7 @@ def get_vepTables_from_VEP(filename, out_dir="."): counter += 1 - out_file = open("{}/latest.{}.csv".format(out_dir, tab_name), "w") + out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w") head = table.pop(0) head[0] = "Label" @@ -121,4 +111,4 @@ def get_vepTables_from_VEP(filename, out_dir="."): if __name__ == '__main__': - fire.Fire(get_vepTables_from_VEP) + fire.Fire(get_vep_tables_from_vep) From bc35b56a04ba4b4ecef6f7943606b3845dcd088a Mon Sep 17 00:00:00 2001 From: liuxh Date: Mon, 28 Aug 2023 20:00:04 +0100 Subject: [PATCH 2/2] front jsons dump --- main.py | 82 +++---- scripts/genotables_from_afogeno.py | 312 +++++++++++++------------ scripts/plot_generator_fromAncestry.py | 1 + scripts/to_front_jsons.py | 197 ++++++++++++++++ scripts/utils.py | 109 ++++++++- scripts/vep_tables_from_vep.py | 114 --------- 6 files changed, 501 insertions(+), 314 deletions(-) create mode 100755 scripts/to_front_jsons.py delete mode 100755 scripts/vep_tables_from_vep.py diff --git a/main.py b/main.py index 7098656..15e44ba 100644 --- a/main.py +++ b/main.py @@ -12,8 +12,9 @@ from scripts.genotables_from_afogeno import geno_tables_from_afogeno from scripts.io import render_latex, export_genotypes_xlsx_from_csvs from scripts.plot_generator_fromAncestry import plot_generator_from_ancestry -from scripts.utils import clean_bam_file_noCHR -from scripts.vep_tables_from_vep import get_vep_tables_from_vep +from scripts.to_front_jsons import vep_labex_tables_to_jsons, geno_tables_to_jsons +from scripts.utils import 
clean_bam_file_noCHR, get_vep_tables_from_vep + def check_path_exist(path, file_type, error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" @@ -94,14 +95,11 @@ def main(): if "SINGULARITY_NAME" in os.environ: dir = "/GenomeChronicler/" - resultsdir = os.getcwd() - # Defining input options and their default values... parser = get_parser() args = parser.parse_args() # conf_file = None - debugFlag = args.debug BAM_file = args.BAM_file gVCF_file = args.gVCF_file VEP_file = args.VEP_file @@ -142,15 +140,8 @@ def main(): sample = None if BAM_file: sample = Path(BAM_file).name.split('.')[0] - # sample = os.path.splitext(os.path.basename(BAM_file))[0] - # sample = re.sub(r'\.recal', '', sample) - # sample = re.sub(r'\.bam\.clean', '', sample, flags=re.IGNORECASE) elif gVCF_file: sample = Path(gVCF_file).name.split('.')[0] - # sample = os.path.splitext(os.path.basename(gVCF_file))[0] - # sample = re.sub(r'\.g\.', '.', sample, flags=re.IGNORECASE) - # sample = re.sub(r'\.g\.vcf', '', sample, flags=re.IGNORECASE) - # sample = re.sub(r'\.vcf', '', sample, flags=re.IGNORECASE) else: print_header_ascii() print("\t --- MAJOR ERROR: No BAM or gVCF file found. Please check the usage notes above and try again ---\n", @@ -158,14 +149,6 @@ def main(): sys.exit(555) output_dir = Path(resultsdir) - temp_dir = output_dir/"temp" - temp_dir.mkdir(parents=True, exist_ok=True) - - # os.system(f"mkdir -p {temp_dir}") - # check_path_exist(temp_dir, "Results directory", exit_code=102, - # error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " - # "created. Please check permissions and try again ---\n") - LOGFILE2 = f"{output_dir}/{sample}.processingLog.stderr.txt" # Main bit of code @@ -181,12 +164,15 @@ def main(): output_vep_dir = output_dir/"vep_dir" get_vep_tables_from_vep(VEP_file, output_vep_dir) + clean_bam_dir = None if BAM_file: print("\t +++ INFO: Preprocessing BAM file") - output_path = temp_dir/f"{sample}.clean.BAM" + clean_bam_dir = output_dir/"clean_bam" + clean_bam_dir.mkdir(parents=True, exist_ok=True) + output_path = clean_bam_dir/f"{sample}.clean.BAM" if not output_path.exists(): - clean_bam_file_noCHR(BAM_file, temp_dir) + clean_bam_file_noCHR(BAM_file, clean_bam_dir) input_path = output_path elif gVCF_file: @@ -210,9 +196,7 @@ def main(): if not Path(output_plot_path).exists(): # cmd = f"PCA_PATH={pca_eigenvec_path} ID={sample} OUTPUT_DIR={output_plot_path} R CMD BATCH {dir}scripts/plot_generator_fromAncestry.R" # ret = subprocess.run(cmd, shell=True) - # print(ret) plot_generator_from_ancestry(pca_eigenvec_path, sample, output_dir) - # exit() print("\t +++ INFO: Generating Genotypes Files") output_afogeno_dir = output_dir/"afogeno" @@ -225,8 +209,10 @@ def main(): print("\t +++ INFO: Generating Genome Report Tables") output_tables_dir = output_dir/"tables" - geno_tables_from_afogeno(afogeno_file, output_tables_dir) - for path in output_tables_dir.glob("*.csv"): + geno_tables_from_afogeno(afogeno_file, output_tables_dir, truncate=True) + output_tables_full_dir = output_dir/"tables_full" + geno_tables_from_afogeno(afogeno_file, output_tables_full_dir, truncate=False) + for path in output_tables_full_dir.glob("*.csv"): shutil.copy(path, output_dir) print("\t +++ INFO: Combining Excel Tables") @@ -249,31 +235,27 @@ def main(): latex_pdf_path = run_latex_dir/f"{sample}_report.pdf" shutil.copy(latex_pdf_path, output_dir) # copy to main results dir + # dump front jsons + output_front_json_dir = output_dir/"front_jsons" + 
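# geno_tables_to_jsons serialises the full (untruncated) report tables, and
+    # vep_labex_tables_to_jsons the VEP/LaTeX summaries, into the JSON files
+    # read by the front-end (see scripts/to_front_jsons.py)
+    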
geno_tables_to_jsons(output_tables_full_dir, output_dir=output_front_json_dir) + vep_labex_tables_to_jsons(output_vep_dir, latex_dir=run_latex_dir, output_dir=output_front_json_dir) + + if no_clean_temporary_files is False: + print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") + + if clean_bam_dir is not None: + shutil.rmtree(clean_bam_dir, ignore_errors=True) + + shutil.rmtree(output_ancestry_dir, ignore_errors=True) + shutil.rmtree(output_afogeno_dir, ignore_errors=True) + # shutil.rmtree(output_tables_dir, ignore_errors=True) + # shutil.rmtree(output_tables_full_dir, ignore_errors=True) + shutil.rmtree(run_latex_dir, ignore_errors=True) + + if output_vep_dir is not None: + shutil.rmtree(output_vep_dir, ignore_errors=True) - # if no_clean_temporary_files is False: - # print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") - # if BAM_file is not None: - # os.remove(BAM_file) - # os.remove(f"{BAM_file}.bai") - # - # shutil.rmtree(f"{temp_dir}") - # for file in Path(f"{output_dir}/").glob("latest*.csv"): - # os.remove(file) - # - # subprocess.run(f"rm -rf {output_dir}/versionTable.txt", shell=True) - # subprocess.run(f"rm -rf {output_dir}/GeneStructure.pdf", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.out", shell=True) - # subprocess.run(f"rm -rf {output_dir}/texput.log", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.aux", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.log", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.tex", shell=True) - # subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True) - - # # time.sleep(1) - # if BAM_file is not None: - # BAM_file = BAM_file.replace(".clean.BAM", "") - # print( - # f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds") + print(f"\t +++ DONE: Finished GenomeChronicler for file [ {input_path} ] in {datetime.now() - start_time} seconds") if __name__ == '__main__': diff --git a/scripts/genotables_from_afogeno.py b/scripts/genotables_from_afogeno.py index 2de75b5..a9c7ed7 100755 --- a/scripts/genotables_from_afogeno.py +++ b/scripts/genotables_from_afogeno.py @@ -140,7 +140,7 @@ def generate_ClinVar_url(rsid, sth5, sth5_sql): return "" -def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference"): +def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference", truncate=True): """ This function generates the geno tables from the afogeno output. 
@@ -246,159 +246,183 @@ def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="2 # output_path = f'{outdir}/latest.good.reportTable.csv' # if not Path(output_path).exists(): - if True: - IN = open(afogeno_path) - rows_GOOD = [] - rows_BAD = [] - Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - - for line in tqdm(IN, disable=not verbose): - # for line in IN: - line = line.strip() - if line == "": - continue - if line.startswith("#"): + + IN = open(afogeno_path) + # rows_GOOD = [] + # rows_BAD = [] + # Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + # Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + # proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", + # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + # proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", + # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + entries_good = [] + entries_bad = [] + for line in tqdm(IN, disable=not verbose): + # for line in IN: + line = line.strip() + if line == "": + continue + if line.startswith("#"): + continue + + debugBuffer = "" + + data = line.split("\t") + if len(data) < 12: + data.append("") + + debugBuffer += "\n+++\n" + "\t".join(data) + "\n" + + strand = "plus" + snp = data[3] + + # rvS = sth1s.execute(snp) + rvS = sth1s.execute(sth1s_sql, (snp,)) + counter = 0 + for row in sth1s.fetchall(): + strand = row[1] + counter += 1 + if counter > 1: + raise Exception("Oh my Gawd!!! 
DNA has two strands (and it isn't a good thing in this case)") + + # rvFlag = sth1f.execute(snp) + rvS = sth1f.execute(sth1f_sql, (snp,)) + flagID = 0 + for row in sth1f.fetchall(): + flagID = 1 + if flagID == 1 and (data[8] == "0/0" or data[8] == "./."): + continue + + # strand + if strand == "minus": + data[9] = data[9].translate(str.maketrans("ACTGactg", "TGACtgac")) + data[10] = data[10].translate(str.maketrans("ACTGactg", "TGACtgac")) + + genotype = [data[9], data[10]] + extra = data[11] + + genotypes[snp] = {} + if extra != "": + genotypes[snp][extra] = 1 + genotypes[snp][genotype[0]] = 1 + genotypes[snp][genotype[1]] = 1 + genotypes[snp][";".join(genotype)] = 1 + + try: + rv = sth1c.execute(sth1c_sql, (snp,)) + except Exception as e: + print(data) + print(e) + continue + alleles = {} + countGenotype = 0 + for row in sth1c.fetchall(): + debugBuffer += "ID = " + str(row[0]) + "\n" + debugBuffer += "REPUTE = " + str(row[1]) + "\n" + debugBuffer += "MAGNITUDE = " + str(row[2]) + "\n" + debugBuffer += "ALLELE1 = " + str(row[3]) + "\n" + debugBuffer += "ALLELE2 = " + str(row[4]) + "\n" + debugBuffer += "SUMMARY = " + str(row[5]) + "\n" + debugBuffer += "RSID = " + str(row[6]) + "\n" + debugBuffer += "IID = " + str(row[7]) + "\n" + + row = list(row) + row.append(row[5]) + row[5] = cleanSummaryString(row[5]) + # if truncate: + # row[5] = cleanSummaryString(row[5]) + # if "," in row[5]: + # row[5] = f'"{row[5]}"' + + a1 = row[3] + a2 = row[4] + + # if row[2] is None or row[2] == 0: + if row[2] is None or row[2] == 0 or row[2] == "0": # Excluding magnitude 0 continue - debugBuffer = "" + alleles.setdefault(a1, {})[a2] = [row[1], row[0], row[2], f"({a1};{a2})", row[5], row[-1]] + alleles.setdefault(a2, {})[a1] = alleles[a1][a2] - data = line.split("\t") - if len(data) < 12: - data.append("") + countGenotype += 1 - debugBuffer += "\n+++\n" + "\t".join(data) + "\n" - strand = "plus" - snp = data[3] - # rvS = sth1s.execute(snp) - rvS = sth1s.execute(sth1s_sql, (snp,)) - counter = 0 - for row in sth1s.fetchall(): - strand = row[1] - counter += 1 - if counter > 1: - raise Exception("Oh my Gawd!!! 
DNA has two strands (and it isn't a good thing in this case)") + if countGenotype: + if genotype[0] in alleles and genotype[1] in alleles[genotype[0]]: - # rvFlag = sth1f.execute(snp) - rvS = sth1f.execute(sth1f_sql, (snp,)) - flagID = 0 - for row in sth1f.fetchall(): - flagID = 1 - if flagID == 1 and (data[8] == "0/0" or data[8] == "./."): - continue + geno_desc = alleles[genotype[0]][genotype[1]][4] + geno_desc_full = alleles[genotype[0]][genotype[1]][5] + #Implement some filters here + if re.findall(r"^\s*$|^Common|^None|^Normal|^Average|^Benign most likely|^Ancestral value|^Unaffected Genotype", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Benign variant|^L22- S142-|^Depends on|^Extensive metabolizer", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Typical BuChE|Complex; generally normal risk|common in clinvar|Major allele, normal risk|Slight if any|Likely to be benign|Most common genotype", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Most likely a benign polymorphism|Benign \(harmless\) variant|^1.3x risk$", geno_desc, re.IGNORECASE): + continue + if re.findall(r"normal risk of migraine|More likely to go bald; common|Most likely benign polymorphism|Slightly increased lifespan?|^Benign polymorphism", geno_desc, re.IGNORECASE): + continue + if re.findall(r"No increased risk of |Likely to be a benign variant|Carrier of a benign change|Classified as benign in ClinVar", geno_desc, re.IGNORECASE): + continue - # strand - if strand == "minus": - data[9] = data[9].translate(str.maketrans("ACTGactg", "TGACtgac")) - data[10] = data[10].translate(str.maketrans("ACTGactg", "TGACtgac")) - - genotype = [data[9], data[10]] - extra = data[11] - - genotypes[snp] = {} - if extra != "": - genotypes[snp][extra] = 1 - genotypes[snp][genotype[0]] = 1 - genotypes[snp][genotype[1]] = 1 - genotypes[snp][";".join(genotype)] = 1 - - try: - rv = sth1c.execute(sth1c_sql, (snp,)) - except Exception as e: - print(data) - print(e) - continue - alleles = {} - countGenotype = 0 - for row in sth1c.fetchall(): - debugBuffer += "ID = " + str(row[0]) + "\n" - debugBuffer += "REPUTE = " + str(row[1]) + "\n" - debugBuffer += "MAGNITUDE = " + str(row[2]) + "\n" - debugBuffer += "ALLELE1 = " + str(row[3]) + "\n" - debugBuffer += "ALLELE2 = " + str(row[4]) + "\n" - debugBuffer += "SUMMARY = " + str(row[5]) + "\n" - debugBuffer += "RSID = " + str(row[6]) + "\n" - debugBuffer += "IID = " + str(row[7]) + "\n" - - row = list(row) - row[5] = cleanSummaryString(row[5]) - - a1 = row[3] - a2 = row[4] - - # if row[2] is None or row[2] == 0: - if row[2] is None or row[2] == 0 or row[2] == "0": # Excluding magnitude 0 + rsid = alleles[genotype[0]][genotype[1]][1] + gnomAD = generate_GnomAD_url(rsid, sth3, sth3_sql) + getevidence = generate_get_evidence_url(rsid, sth4, sth4_sql) + snpedia = generate_SNPedia_url(rsid) + clinvar = generate_ClinVar_url(rsid, sth5, sth5_sql) + + if gnomAD == "" and getevidence == "" and clinvar == "": continue - alleles.setdefault(a1, {})[a2] = [row[1], row[0], row[2], f"({a1};{a2})", row[5]] - alleles.setdefault(a2, {})[a1] = alleles[a1][a2] - - countGenotype += 1 - - - - if countGenotype: - if genotype[0] in alleles and genotype[1] in alleles[genotype[0]]: - - geno_desc = alleles[genotype[0]][genotype[1]][4] - #Implement some filters here - if re.findall(r"^\s*$|^Common|^None|^Normal|^Average|^Benign most likely|^Ancestral value|^Unaffected Genotype", geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Benign variant|^L22- S142-|^Depends on|^Extensive metabolizer", 
geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Typical BuChE|Complex; generally normal risk|common in clinvar|Major allele, normal risk|Slight if any|Likely to be benign|Most common genotype", geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Most likely a benign polymorphism|Benign \(harmless\) variant|^1.3x risk$", geno_desc, re.IGNORECASE): - continue - if re.findall(r"normal risk of migraine|More likely to go bald; common|Most likely benign polymorphism|Slightly increased lifespan?|^Benign polymorphism", geno_desc, re.IGNORECASE): - continue - if re.findall(r"No increased risk of |Likely to be a benign variant|Carrier of a benign change|Classified as benign in ClinVar", geno_desc, re.IGNORECASE): - continue - - rsid = alleles[genotype[0]][genotype[1]][1] - gnomAD = generate_GnomAD_url(rsid, sth3, sth3_sql) - getevidence = generate_get_evidence_url(rsid, sth4, sth4_sql) - snpedia = generate_SNPedia_url(rsid) - clinvar = generate_ClinVar_url(rsid, sth5, sth5_sql) - - if gnomAD == "" and getevidence == "" and clinvar == "": - continue - - if rsid in tmpBlacklist: - continue - - alleles[genotype[0]][genotype[1]][1] = snpedia - alleles[genotype[0]][genotype[1]][4] = generate_Summary_url(rsid, - alleles[genotype[0]][genotype[1]][ - 4]) - tmpString = ",".join( - [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], - alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]]) - tmpString = tmpString.encode('ascii', 'ignore').decode() # Removing non-ASCII characters - if alleles[genotype[0]][genotype[1]][2] is None or alleles[genotype[0]][genotype[1]][2] == "": - continue - elif alleles[genotype[0]][genotype[1]][0] is None or alleles[genotype[0]][genotype[1]][0] == "": - # print("WARNING: Go here and with a fair sense of justice determine is this is good or bad\n", tmpString, "\n") - continue - elif alleles[genotype[0]][genotype[1]][0] == "Good": - proc_good.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) - elif alleles[genotype[0]][genotype[1]][0] == "Bad": - proc_bad.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) - - elif countGenotype > 2: - print("IMPORTANT: ", debugBuffer) - print("IMPORTANT: ", debugBuffer) - print("IMPORTANT: Please debug this here as I couldn't find the right alleles [", strand, "] [", - countGenotype, "]\n\n\n") - - proc_good.stdin.close() - proc_bad.stdin.close() + if rsid in tmpBlacklist: + continue + + desc = geno_desc_full + if truncate: + desc = geno_desc + alleles[genotype[0]][genotype[1]][1] = snpedia + alleles[genotype[0]][genotype[1]][4] = generate_Summary_url(rsid, desc) + entry = [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], + alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]] + + # tmpString = ",".join( + # [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], + # alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]]) + # tmpString = tmpString.encode('ascii', 'ignore').decode() # Removing non-ASCII characters + if alleles[genotype[0]][genotype[1]][2] is None or alleles[genotype[0]][genotype[1]][2] == "": + continue + elif alleles[genotype[0]][genotype[1]][0] is None or alleles[genotype[0]][genotype[1]][0] == "": + # print("WARNING: Go here and with a fair sense of justice determine is this is good or bad\n", tmpString, "\n") + continue + elif alleles[genotype[0]][genotype[1]][0] == "Good": + # 
proc_good.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) + entry += [gnomAD, getevidence, clinvar] + entries_good.append(entry) + elif alleles[genotype[0]][genotype[1]][0] == "Bad": + entry += [gnomAD, getevidence, clinvar] + entries_bad.append(entry) + # proc_bad.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) + + elif countGenotype > 2: + print("IMPORTANT: ", debugBuffer) + print("IMPORTANT: ", debugBuffer) + print("IMPORTANT: Please debug this here as I couldn't find the right alleles [", strand, "] [", + countGenotype, "]\n\n\n") + + # proc_good.stdin.close() + # proc_bad.stdin.close() + cols = "Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar".split(",") + df_good = pd.DataFrame(entries_good, columns=cols) + df_good = df_good.sort_values(by=["Mag."], ascending=False) + df_good = df_good.drop_duplicates() + df_good.to_csv(f"{temp_dir}/latest.good.reportTable.csv", index=False) + df_bad = pd.DataFrame(entries_bad, columns=cols) + df_bad = df_bad.sort_values(by=["Mag."], ascending=False) + df_bad = df_bad.drop_duplicates() + df_bad.to_csv(f"{temp_dir}/latest.bad.reportTable.csv", index=False) positiveGenosets = {} diff --git a/scripts/plot_generator_fromAncestry.py b/scripts/plot_generator_fromAncestry.py index 2f254eb..7a206eb 100755 --- a/scripts/plot_generator_fromAncestry.py +++ b/scripts/plot_generator_fromAncestry.py @@ -120,6 +120,7 @@ def plot_generator_from_ancestry(eigenvec_path, sample_id, output_dir): ax_zoom.set_yticks([]) fig.savefig(f"{output_dir}/AncestryPlot.pdf", format='pdf', dpi=150, bbox_inches='tight') + fig.savefig(f"{output_dir}/AncestryPlot.png", format='png', dpi=150, bbox_inches='tight') if __name__ == '__main__': fire.Fire(plot_generator_from_ancestry) \ No newline at end of file diff --git a/scripts/to_front_jsons.py b/scripts/to_front_jsons.py new file mode 100755 index 0000000..5d21884 --- /dev/null +++ b/scripts/to_front_jsons.py @@ -0,0 +1,197 @@ +import json +import os +import subprocess +from pathlib import Path +import re + +import sqlite3 +import fire +import pandas as pd +from tqdm import tqdm + + +def link_warper_split(x): + tokens = x.split('}{') + link = tokens[0].split('\\href{')[-1] + text = tokens[1].strip('{}') + return text, link + +def row2entry(row): + data = {} + if 'Magnitude' in row: + magnitude = row['Magnitude'] + elif 'Mag.' in row: + magnitude = row['Mag.'] + else: + magnitude = None + data['magnitude'] = magnitude + + if 'Genotype' in row: + data['genotype'] = row['Genotype'].strip('()').split(';') + + text, _ = link_warper_split(row['Summary']) + desc_full = text + desc_short = text + if len(text) > 50: + desc_short = text[:47] + '...' 
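+    # keep the full summary text plus a ~50-character preview; presumably the
+    # front-end shows the short form in table cells and the full form on expansion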
+ data['description'] = { + 'full': desc_full, + 'short': desc_short, + } + id_name, id_link = link_warper_split(row['Identifier']) + data['identifier'] = { + 'link': id_link, + 'name': id_name, + } + db_links = [] + db_links += [{'link': id_link, 'name': 'SNPedia'}] + for db_name in 'GnomAD GetEvidence ClinVar'.split(): + if db_name not in row: + continue + try: + _, link = link_warper_split(row[db_name]) + db_links += [{'link': link, 'name': db_name}] + except: + db_links += [{'link': '', 'name': db_name}] + data['db_links'] = db_links + return data + + + +def geno_tables_to_jsons(table_full_dir, output_dir): + table_full_dir = Path(table_full_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + tables = { + 'beneficial_data.json': 'latest.good.reportTable.csv', + 'harmful_data.json': 'latest.bad.reportTable.csv', + 'genosets_data.json': 'latest.genoset.reportTable.csv', + } + for json_name, table_name in tables.items(): + path_table = table_full_dir / table_name + df = pd.read_csv(path_table) + data = df.apply(row2entry,axis=1).tolist() + # Path() + # data = table_to_json_data(df) + path_json = output_dir / json_name + path_json.write_text(json.dumps(data, indent=2)) + # pd.DataFrame(data).to_json(path_json, orient='records', indent=2) + + +json_templates = { + 'consequence_type_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#6699cc", "#fff275", "#ff8c42", "#ff3c38", "#a23e48", "#66635b", "#4e5166"], + "borderColor": ["#6699cc", "#fff275", "#ff8c42", "#ff3c38", "#a23e48", "#66635b", "#4e5166"], + "borderWidth": 1 + }] + }, + 'polyphen_summary_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#ff7700","#004777","#a30000","#efd28d"], + "borderColor": ["#ff7700","#004777","#a30000","#efd28d"], + "borderWidth": 1 + }] + }, + 'variant_class_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#ee6352","#59cd90","#3fa7d6","#fac05e","#f79d84"], + "borderColor": ["#ee6352","#59cd90","#3fa7d6","#fac05e","#f79d84"], + "borderWidth": 1 + }] + } +} + +labels_order = { + 'consequence_type_data.json': [ # order? 
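+        # fixed display order for the consequence-type pie chart; any label missing
+        # from the VEP table is skipped by vep_table_to_pie_json below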
+ 'Intron variant', + 'Non coding transcript variant', + 'Others', + 'Intergenic variant', + 'Regulatory region variant', + 'Downstream gene variant', + 'Upstream gene variant', + ], +} + +def vep_table_to_pie_json(df, json_name): + # Initialize JSON data structure + json_data = json_templates[json_name].copy() + if json_name in labels_order: + labels = labels_order[json_name] + else: + df = df.sort_values('Count', ascending=False) + labels = df['Label'].tolist() + + # Fill JSON data structure + json_data["labels"] = [] + data_table = df.set_index('Label')['Count'] + for label in labels: + if label not in data_table: + continue + count = data_table[label] + json_data["labels"].append(f"{label} ({count:.2f}%)") + json_data["datasets"][0]["data"].append(round(count, 2)) + return json_data + + +def vep_labex_tables_to_jsons(vep_dir, latex_dir, output_dir): + if vep_dir is not None: + vep_dir = Path(vep_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + tables = { + 'consequence_type_data.json': 'latest.consequences_table.csv', + 'polyphen_summary_data.json': 'latest.polyphen_table.csv', + 'variant_class_data.json': 'latest.var_class_table.csv', + } + for json_name, table_name in tables.items(): + path_table = vep_dir / table_name + df = pd.read_csv(path_table) + json_data = vep_table_to_pie_json(df, json_name) + path_json = output_dir / json_name + path_json.write_text(json.dumps(json_data, indent=2)) + + if latex_dir is not None: + latex_dir = Path(latex_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + # report_metadata.json + output_path = output_dir / 'report_metadata.json' + df_version = pd.read_csv(latex_dir / 'versionTable.txt', sep='\t') + df_version['db_links'] = df_version.apply(lambda x: [{'name':x['Resource'],'link':link_warper_split(x['Website'])[1]}],axis=1) + df_version = df_version.rename(columns={'Resource':'resource','Version':'version'}) + df_version = df_version.drop(columns=['Website']) + df_version.to_json(output_path, orient='records', indent=2) + + # variant_calling_data.json + output_path = output_dir / 'variant_calling_data.json' + df_sum = pd.read_csv(latex_dir / 'latest.summary.csv',sep='\t') + df_sum = df_sum.reset_index().rename(columns={'index':'key'}) + df_sum['key'] = df_sum['key']+1 + df_sum = df_sum.rename(columns={'Feature':'feature','Count':'count'}) + df_sum.to_json(output_path, orient='records', indent=2) + +funcs = { + 'geno_tables_to_jsons': geno_tables_to_jsons, + 'vep_labex_tables_to_jsons': vep_labex_tables_to_jsons, +} + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/utils.py b/scripts/utils.py index 3176ca8..0de8e0c 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -1,8 +1,15 @@ +import re import subprocess from pathlib import Path import fire +def ucfirst(s): + if len(s) > 0: + return s[0].upper() + s[1:] + else: + return s + def clean_bam_file_noCHR(bam_path, output_dir=None): """ @@ -41,9 +48,105 @@ def clean_bam_file_noCHR(bam_path, output_dir=None): subprocess.run(f"samtools index -@ 6 {output_path}", shell=True) +def get_vep_tables_from_vep(vep_path, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + out_file = open(f"{output_dir}/latest.summary.csv", "w") + out_file.write("Feature\tCount\n") + + recording = 0 + with subprocess.Popen(f'cat {vep_path} | 
grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
+        for line in proc.stdout:
+            line = line.decode()
+            line = line.rstrip("\n")
+            if not line:
+                continue
+
+            # Note for future generations: This is as silly as Perl gets, but it does the job (for now). I will re-write this with proper parsing at some point, or even better, use an off-the-shelf HTML parsing module.
+            # Flatten the stats HTML enough to scrape it: row ends become newlines, cell
+            # ends become tabs, and the <table> markers are kept to track table boundaries.
+            # (The tag patterns below may need adjusting to the exact VEP markup.)
+            line = line.replace("</tr>", "\n")
+            line = re.sub(r"</table>", "</table>\n\n", line)
+            line = re.sub(r"</t[dh]>", "\t", line)
+            line = re.sub(r"<t[rdh][^>]*>", "", line)
+
+            lines = line.split("\n")
+            for l in lines:
+                if "stats_table" in l:
+                    recording += 1
+                    continue
+                elif ("</table>" in l.lower()) and (recording == 2):
+                    recording = 0
+
+                # l = l.strip()
+                l = re.sub(r"^\s+", "", l)
+                l = re.sub(r"\s+$", "", l)
+                l = re.sub(r"\s{2,}", "\t", l)
+
+                if ("variants processed" in l.lower()):
+                    continue
+                if ("lines of output written" in l.lower()):
+                    continue
+
+                if recording == 2:
+                    out_file.write("{}\n".format(l))
+
+    # in_file.close()
+    out_file.close()
+
+    with open(vep_path, "r") as in_file:
+        for line in in_file:
+            line = line.strip()
+            if not line or "drawTable" not in line or "[" not in line:
+                continue
+
+            match = re.search(r"drawTable\('(.+?)'.+?\[(.+)\]", line)
+            if match:
+                tab_name = match.group(1)
+                tab_string = match.group(2)
+
+                counter = 0
+                summer = 0
+                table = []
+                for pair in re.findall(r"\[(.+?)\]", tab_string):
+                    pair = pair.replace("'", "")
+                    pair = pair.replace("_", " ")
+                    if counter == 0:
+                        pair = pair.replace(" ", "")
+                    split = pair.split(",")
+                    table.append(split)
+
+                    if counter > 0:
+                        summer += int(split[1])
+
+                    counter += 1
+
+                out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w")
+
+                head = table.pop(0)
+                head[0] = "Label"
+                out_file.write("{}\n".format(",".join(head)))
+
+                other = 0
+                for d in table:
+                    perc = f"{round(float(d[1]) / summer * 100, 2):0.2f}"
+                    if float(perc) > 1:
+                        d[0] = ucfirst(d[0])
+                        d[1] = perc
+                        out_file.write("{}\n".format(",".join([str(x) for x in d])))
+                    else:
+                        other += int(d[1])
+
+                if other > 0:
+                    perc = f"{round(float(other) / summer * 100, 2):0.2f}"
+                    out_file.write(f"Others,{perc}\n")
+                    # out_file.write("{}\n".format(",".join([str(x) for x in d])))
+
 
 funcs = {
     'clean_bam_file_noCHR': clean_bam_file_noCHR,
+    'get_vep_tables_from_vep': get_vep_tables_from_vep,
 }
 
 
@@ -57,9 +160,3 @@ def __init__(self):
         setattr(UtilsTools, k, staticmethod(v))
     fire.Fire(UtilsTools)
 
-
-def ucfirst(s):
-    if len(s) > 0:
-        return s[0].upper() + s[1:]
-    else:
-        return s
diff --git a/scripts/vep_tables_from_vep.py b/scripts/vep_tables_from_vep.py
deleted file mode 100755
index d1ef4f2..0000000
--- a/scripts/vep_tables_from_vep.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-import os
-import re
-import subprocess
-from pathlib import Path
-
-import fire
-
-if __name__ == '__main__':
-    import sys
-    sys.path.append(os.path.abspath(os.getcwd()))
-
-from scripts.utils import ucfirst
-
-
-def get_vep_tables_from_vep(vep_path, output_dir):
-    output_dir = Path(output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    out_file = open(f"{output_dir}/latest.summary.csv", "w")
-    out_file.write("Feature\tCount\n")
-
-    recording = 0
-    with subprocess.Popen(f'cat {vep_path} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
-        for line in proc.stdout:
-            line = line.decode()
-            line = line.rstrip("\n")
-            if not line:
-                continue
-
-            # Note for future generations: This is as silly as Perl gets, but it does the job (for now).
I will re-write this with proper parsing at some point, or even better, use an off-the-shelve HTML parsing module. - line = line.replace("", "\n") - line = re.sub(r"", "\n\n", line) - line = re.sub(r"", "\t", line) - line = re.sub(r"|", "", line) - - lines = line.split("\n") - for l in lines: - if "stats_table" in l: - recording += 1 - continue - elif ("" in l.lower()) and (recording == 2): - recording = 0 - - # l = l.strip() - l = re.sub(r"^\s+", "", l) - l = re.sub(r"\s+$", "", l) - l = re.sub(r"\s{2,}", "\t", l) - - if ("variants processed" in l.lower()): - continue - if ("lines of output written" in l.lower()): - continue - - - - if recording == 2: - out_file.write("{}\n".format(l)) - - # in_file.close() - out_file.close() - - with open(vep_path, "r") as in_file: - for line in in_file: - line = line.strip() - if not line or "drawTable" not in line or "[" not in line: - continue - - match = re.search(r"drawTable\('(.+?)'.+?\[(.+)\]", line) - if match: - tab_name = match.group(1) - tab_string = match.group(2) - - counter = 0 - summer = 0 - table = [] - for pair in re.findall(r"\[(.+?)\]", tab_string): - pair = pair.replace("'", "") - pair = pair.replace("_", " ") - if counter == 0: - pair = pair.replace(" ", "") - split = pair.split(",") - table.append(split) - - if counter > 0: - summer += int(split[1]) - - counter += 1 - - out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w") - - head = table.pop(0) - head[0] = "Label" - out_file.write("{}\n".format(",".join(head))) - - other = 0 - for d in table: - perc = f"{round(float(d[1]) / summer * 100, 2):0.2f}" - if float(perc) > 1: - d[0] = ucfirst(d[0]) - d[1] = perc - out_file.write("{}\n".format(",".join([str(x) for x in d]))) - else: - other += int(d[1]) - - if other > 0: - perc = f"{round(float(other) / summer * 100, 2):0.2f}" - out_file.write(f"Others,{perc}\n") - # out_file.write("{}\n".format(",".join([str(x) for x in d]))) - - - -if __name__ == '__main__': - fire.Fire(get_vep_tables_from_vep)