From e4f15a48e91ce9e9a8dcd4f82f5fcbce0d65884c Mon Sep 17 00:00:00 2001 From: liuxh Date: Sun, 27 Aug 2023 20:29:13 +0100 Subject: [PATCH 1/2] refactor code structure --- main.py | 256 ++++++------- ...omeChronicler_afogeno_generator_fromBAM.py | 193 ---------- ...omeChronicler_afogeno_generator_fromVCF.py | 156 -------- ...meChronicler_ancestry_generator_fromBAM.py | 299 --------------- ...meChronicler_ancestry_generator_fromVCF.py | 229 ------------ ...Chronicler_quickFilterFinalReportTables.py | 33 -- scripts/__init__.py | 0 scripts/afogeno_generator.py | 193 ++++++++++ scripts/ancestry_generator.py | 350 ++++++++++++++++++ ...mAfoGeno.py => genotables_from_afogeno.py} | 138 +++++-- ...omeChronicler_XLSX_fromTables.py => io.py} | 110 +++++- ...cestry.R => plot_generator_fromAncestry.R} | 10 +- ...stry.py => plot_generator_fromAncestry.py} | 4 +- scripts/utils.py | 65 ++++ ...bles_fromVEP.py => vep_tables_from_vep.py} | 38 +- 15 files changed, 940 insertions(+), 1134 deletions(-) delete mode 100755 scripts/GenomeChronicler_afogeno_generator_fromBAM.py delete mode 100755 scripts/GenomeChronicler_afogeno_generator_fromVCF.py delete mode 100755 scripts/GenomeChronicler_ancestry_generator_fromBAM.py delete mode 100755 scripts/GenomeChronicler_ancestry_generator_fromVCF.py delete mode 100755 scripts/GenomeChronicler_quickFilterFinalReportTables.py create mode 100644 scripts/__init__.py create mode 100755 scripts/afogeno_generator.py create mode 100755 scripts/ancestry_generator.py rename scripts/{GenomeChronicler_genoTables_fromAfoGeno.py => genotables_from_afogeno.py} (80%) rename scripts/{GenomeChronicler_XLSX_fromTables.py => io.py} (51%) rename scripts/{GenomeChronicler_plot_generator_fromAncestry.R => plot_generator_fromAncestry.R} (97%) rename scripts/{GenomeChronicler_plot_generator_fromAncestry.py => plot_generator_fromAncestry.py} (99%) create mode 100644 scripts/utils.py rename scripts/{GenomeChronicler_vepTables_fromVEP.py => vep_tables_from_vep.py} (80%) diff --git a/main.py b/main.py index 5bffa1d..7098656 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,27 @@ #!/usr/bin/env python3 import argparse -import re +import shutil import subprocess import sys from datetime import datetime from pathlib import Path -import shutil -import time -import os -from tqdm import tqdm +from scripts.afogeno_generator import generate_afogeno_pipeline +from scripts.ancestry_generator import generate_ancestry_pipeline +from scripts.genotables_from_afogeno import geno_tables_from_afogeno +from scripts.io import render_latex, export_genotypes_xlsx_from_csvs +from scripts.plot_generator_fromAncestry import plot_generator_from_ancestry +from scripts.utils import clean_bam_file_noCHR +from scripts.vep_tables_from_vep import get_vep_tables_from_vep + +def check_path_exist(path, file_type, + error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" + "path}], please check the provided path and try again ---\n", + exit_code=404): + if path and not Path(path).exists(): + print(error_template.format(file_type=file_type, path=path), file=sys.stderr) + sys.exit(exit_code) def print_header_ascii(): @@ -28,16 +39,6 @@ def print_header_ascii(): """) -def cleanBAMfile_noCHR(filename, verbose=False): - # Reheader the BAM file and create an index - subprocess.run( - f"samtools reheader -c 'perl -pe \"s/^(@SQ.*)(\tSN:)chr/\$1\$2/\"' {filename} > {filename}.clean.BAM", - shell=True) - - print(f"\t +++ INFO: Indexing the BAM file") - subprocess.run(f"samtools index -@ 6 {filename}.clean.BAM", 
shell=True) - - def get_parser(): parser = argparse.ArgumentParser(description="GenomeChronicler") parser.add_argument('-d', '--debug', action='store_true', help='Debug flag') @@ -83,7 +84,7 @@ def get_parser(): parser.add_argument('--verbose', dest='verbose', type=bool, default=False) return parser -def main_druid(): +def main(): import os ### Processing Needed steps ### @@ -94,9 +95,6 @@ def main_druid(): dir = "/GenomeChronicler/" resultsdir = os.getcwd() - template_withVEP = f"{dir}templates/reportTemplate_withVEP.tex" - template_ohneVEP = f"{dir}templates/reportTemplate_ohneVEP.tex" - template = template_ohneVEP # Defining input options and their default values... parser = get_parser() @@ -124,15 +122,6 @@ def main_druid(): start_time = datetime.now() ##################### check file existence and proceed - - def check_path_exist(path, file_type, - error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" - "path}], please check the provided path and try again ---\n", - exit_code=404): - if path and not Path(path).exists(): - print(error_template.format(file_type=file_type, path=path), file=sys.stderr) - sys.exit(exit_code) - if BAM_file: check_path_exist(BAM_file, "BAM file", exit_code=404) elif gVCF_file: @@ -151,7 +140,6 @@ def check_path_exist(path, file_type, check_path_exist(templateParam, "LaTeX Template file", exit_code=501) sample = None - import os if BAM_file: sample = Path(BAM_file).name.split('.')[0] # sample = os.path.splitext(os.path.basename(BAM_file))[0] @@ -169,153 +157,125 @@ def check_path_exist(path, file_type, file=sys.stderr) sys.exit(555) - output_sample_dir = f"{resultsdir}/results/results_{sample}" - temp_dir = f"{output_sample_dir}/temp" - - os.system(f"mkdir -p {temp_dir}") - check_path_exist(temp_dir, "Results directory", exit_code=102, - error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " - "created. Please check permissions and try again ---\n") - - LOGFILE2 = f"{output_sample_dir}/{sample}.processingLog.stderr.txt" + output_dir = Path(resultsdir) + temp_dir = output_dir/"temp" + temp_dir.mkdir(parents=True, exist_ok=True) - # Dump parameters - if debugFlag: - print(f"SCRIPT = {scriptName}", file=sys.stderr) - print(f"DEBUG = {debugFlag}", file=sys.stderr) - print(f"BAM = {BAM_file}", file=sys.stderr) - print(f"VCF = {gVCF_file}", file=sys.stderr) - print(f"VEP = {VEP_file}\n", file=sys.stderr) - print(f"GATKthreads = {GATKthreads}\n\n", file=sys.stderr) + # os.system(f"mkdir -p {temp_dir}") + # check_path_exist(temp_dir, "Results directory", exit_code=102, + # error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " + # "created. 
Please check permissions and try again ---\n") - print(f"TEMPLATE = {template}", file=sys.stderr) - print(f"SAMPLE = {sample}", file=sys.stderr) - print(f"DIR = {dir}", file=sys.stderr) - print(f"TMPDIR = {temp_dir}\n", file=sys.stderr) - # print(f"LOGFILE1 = {LOGFILE1}\n", file=sys.stderr) - print(f"LOGFILE2 = {LOGFILE2}\n\n", file=sys.stderr) - - sys.exit(0) + LOGFILE2 = f"{output_dir}/{sample}.processingLog.stderr.txt" # Main bit of code print_header_ascii() - print(f"\t +++ INFO: Starting Processing at {start_time.strftime('%Y-%m-%d %H:%M:%S')}", file=sys.stderr) print(f"\t +++ INFO: Opening Log File at: {LOGFILE2}", file=sys.stderr) subprocess.run(f"echo > {LOGFILE2}", shell=True) - clean_sample = sample.replace("_", "\\_") - - with open(f"{output_sample_dir}/SampleName.txt", "w") as sample_name_file: - sample_name_file.write(clean_sample) - + output_vep_dir = None if VEP_file: - template = template_withVEP if not templateParam else templateParam - print("\t +++ INFO: Preprocessing VEP file") - cmd = f"python3 {dir}scripts/GenomeChronicler_vepTables_fromVEP.py {VEP_file} {output_sample_dir}/" - subprocess.run(cmd, shell=True) + output_vep_dir = output_dir/"vep_dir" + get_vep_tables_from_vep(VEP_file, output_vep_dir) if BAM_file: print("\t +++ INFO: Preprocessing BAM file") - if not os.path.exists(f"{BAM_file}.clean.BAM") or not os.path.exists(f"{BAM_file}.clean.BAM.bai"): - cleanBAMfile_noCHR(BAM_file, verbose=verbose) - - BAM_file = f"{BAM_file}.clean.BAM" - - print("\t +++ INFO: Generating Ancestry") - cmd = f"python3 {dir}scripts/GenomeChronicler_ancestry_generator_fromBAM.py {BAM_file} {output_sample_dir}/ {GATKthreads} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Ancestry Plots") - output_path = f"{output_sample_dir}/AncestryPlot.pdf" - if not Path(output_path).exists(): - cmd = f"SAMPLE={sample} ID={sample} DIR={resultsdir} R CMD BATCH {dir}scripts/GenomeChronicler_plot_generator_fromAncestry.R" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Genotypes Files") - cmd = f"python3 {dir}scripts/GenomeChronicler_afogeno_generator_fromBAM.py {BAM_file} {output_sample_dir}/ {GATKthreads} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - AFOgeno_file = f"{temp_dir}/{sample}.afogeno38.txt" + output_path = temp_dir/f"{sample}.clean.BAM" + if not output_path.exists(): + clean_bam_file_noCHR(BAM_file, temp_dir) + input_path = output_path elif gVCF_file: print("\t +++ INFO: Preprocessing VCF file") - - print("\t +++ INFO: Generating Ancestry") - if not Path(f"{output_sample_dir}/{sample}.genotypingVCF.vcf.gz").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_ancestry_generator_fromVCF.py {gVCF_file} {sample} {output_sample_dir} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - # if not Path(f"{output_sample_dir}/{sample}.genotypingVCF.vcf.gz").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_plot_generator_fromAncestry.py {output_sample_dir}/temp/{sample}_1kGP_pruned_pca_20.eigenvec {sample} {output_sample_dir}/" - subprocess.run(cmd, shell=True) - - print("\t +++ INFO: Generating Genotypes Files") - cmd = f"python3 {dir}scripts/GenomeChronicler_afogeno_generator_fromVCF.py {gVCF_file} {sample} {output_sample_dir} 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) - AFOgeno_file = f"{temp_dir}/{sample}.afogeno38.txt" - + input_path = gVCF_file else: print("\t +++ ERROR: No BAM or VCF file provided. 
Exiting.") sys.exit() - print("\t +++ INFO: Generating Genome Report Tables") - if not Path(f"{output_sample_dir}/latest.good.reportTable.csv").exists(): - cmd = f"python3 {dir}scripts/GenomeChronicler_genoTables_fromAfoGeno.py {AFOgeno_file} {output_sample_dir}/ 2>> {LOGFILE2}" - subprocess.run(cmd, shell=True) + print("\t +++ INFO: Generating Ancestry") + output_ancestry_dir = output_dir/"ancestry" + output_ancestry_path = output_ancestry_dir/'vcf_ancestry'/'ancestry.rsIDs.gvcf.gz' + if not output_ancestry_path.exists(): + # cmd = f"python3 {dir}scripts/ancestry_generator.py ancestry_generator {BAM_file} {output_dir} {GATKthreads} 2>> {LOGFILE2}" + # subprocess.run(cmd, shell=True) + generate_ancestry_pipeline(input_path, output_ancestry_dir, GATKthreads) + + print("\t +++ INFO: Generating Ancestry Plots") + pca_eigenvec_path = f"{output_ancestry_dir}/pca_ancestry/1kGP_pruned_pca_20.eigenvec" + output_plot_path = f"{output_dir}/AncestryPlot.pdf" + if not Path(output_plot_path).exists(): + # cmd = f"PCA_PATH={pca_eigenvec_path} ID={sample} OUTPUT_DIR={output_plot_path} R CMD BATCH {dir}scripts/plot_generator_fromAncestry.R" + # ret = subprocess.run(cmd, shell=True) + # print(ret) + plot_generator_from_ancestry(pca_eigenvec_path, sample, output_dir) + # exit() + + print("\t +++ INFO: Generating Genotypes Files") + output_afogeno_dir = output_dir/"afogeno" + output_afogeno_path = output_ancestry_dir/'vcf_afogeno'/'afogeno.rsIDs.gvcf.gz' + afogeno_file = output_afogeno_dir/f"afogeno38.txt" + if not afogeno_file.exists(): + # cmd = f"python3 {dir}scripts/ancestry_generator.py ancestry_generator {BAM_file} {output_dir} {GATKthreads} 2>> {LOGFILE2}" + # subprocess.run(cmd, shell=True) + generate_afogeno_pipeline(input_path, output_afogeno_dir, GATKthreads) - print("\t +++ INFO: Filtering Report Tables") - cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.good.reportTable.csv" - subprocess.run(cmd, shell=True) - cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.bad.reportTable.csv" - subprocess.run(cmd, shell=True) + print("\t +++ INFO: Generating Genome Report Tables") + output_tables_dir = output_dir/"tables" + geno_tables_from_afogeno(afogeno_file, output_tables_dir) + for path in output_tables_dir.glob("*.csv"): + shutil.copy(path, output_dir) print("\t +++ INFO: Combining Excel Tables") - cmd = f"python3 {dir}scripts/GenomeChronicler_XLSX_fromTables.py {output_sample_dir}/ {output_sample_dir}/{sample}genotypes{dtag}.xlsx" - subprocess.run(cmd, shell=True) + output_xlsx_path = output_tables_dir/f"{sample}.genotypes.{dtag}.xlsx" + export_genotypes_xlsx_from_csvs(output_tables_dir, output_xlsx_path) + shutil.copy(output_xlsx_path, output_dir) # copy to main results dir + # render latex template print("\t +++ INFO: Compiling Genome Report") - TEMPLATETEX = f"{sample}_report_{dtag}" - shutil.copy(template, f"{output_sample_dir}/{TEMPLATETEX}.tex") - shutil.copy(f"{dir}templates/versionTable.txt", f"{output_sample_dir}/") - shutil.copy(f"{dir}templates/GeneStructure.pdf", f"{output_sample_dir}/") - - def run_latex(): - cwd = os.getcwd() - os.chdir(f"{resultsdir}/results/results_{sample}") - for _ in range(3): - cmd = f"pdflatex -interaction=nonstopmode {TEMPLATETEX}.tex 2> /dev/null > /dev/null" - subprocess.run(cmd, shell=True) - os.chdir(cwd) - - run_latex() - - if no_clean_temporary_files is False: - print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") - if BAM_file is 
 
-    print("\t +++ INFO: Filtering Report Tables")
-    cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.good.reportTable.csv"
-    subprocess.run(cmd, shell=True)
-    cmd = f"python3 {dir}scripts/GenomeChronicler_quickFilterFinalReportTables.py {output_sample_dir}/latest.bad.reportTable.csv"
-    subprocess.run(cmd, shell=True)
+    print("\t +++ INFO: Generating Genome Report Tables")
+    output_tables_dir = output_dir/"tables"
+    geno_tables_from_afogeno(afogeno_file, output_tables_dir)
+    for path in output_tables_dir.glob("*.csv"):
+        shutil.copy(path, output_dir)
 
     print("\t +++ INFO: Combining Excel Tables")
-    cmd = f"python3 {dir}scripts/GenomeChronicler_XLSX_fromTables.py {output_sample_dir}/ {output_sample_dir}/{sample}genotypes{dtag}.xlsx"
-    subprocess.run(cmd, shell=True)
+    output_xlsx_path = output_tables_dir/f"{sample}.genotypes.{dtag}.xlsx"
+    export_genotypes_xlsx_from_csvs(output_tables_dir, output_xlsx_path)
+    shutil.copy(output_xlsx_path, output_dir)  # copy to main results dir
 
+    # render latex template
     print("\t +++ INFO: Compiling Genome Report")
-    TEMPLATETEX = f"{sample}_report_{dtag}"
-    shutil.copy(template, f"{output_sample_dir}/{TEMPLATETEX}.tex")
-    shutil.copy(f"{dir}templates/versionTable.txt", f"{output_sample_dir}/")
-    shutil.copy(f"{dir}templates/GeneStructure.pdf", f"{output_sample_dir}/")
-
-    def run_latex():
-        cwd = os.getcwd()
-        os.chdir(f"{resultsdir}/results/results_{sample}")
-        for _ in range(3):
-            cmd = f"pdflatex -interaction=nonstopmode {TEMPLATETEX}.tex 2> /dev/null > /dev/null"
-            subprocess.run(cmd, shell=True)
-        os.chdir(cwd)
-
-    run_latex()
-
-    if no_clean_temporary_files is False:
-        print("\t +++ INFO: Cleaning up Temporary and Intermediate Files")
-        if BAM_file is not None:
-            os.remove(BAM_file)
-            os.remove(f"{BAM_file}.bai")
-
-        shutil.rmtree(f"{temp_dir}")
-        for file in Path(f"{output_sample_dir}/").glob("latest*.csv"):
-            os.remove(file)
-
-        subprocess.run(f"rm -rf {output_sample_dir}/versionTable.txt", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/GeneStructure.pdf", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.out", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/texput.log", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.aux", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.log", shell=True)
-        subprocess.run(f"rm -rf {output_sample_dir}/{TEMPLATETEX}.tex", shell=True)
-        subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True)
-
-    time.sleep(1)
-    if BAM_file is not None:
-        BAM_file = BAM_file.replace(".clean.BAM", "")
-    print(
-        f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds")
+    template_withVEP = f"{dir}templates/reportTemplate_withVEP.tex"
+    template_ohneVEP = f"{dir}templates/reportTemplate_ohneVEP.tex"
+    template = template_ohneVEP
+    if VEP_file:
+        template = template_withVEP if not templateParam else templateParam
+    run_latex_dir = output_dir/"latex"
+    render_latex(output_tables_dir, sample, run_latex_dir,
+                 plot_path=output_plot_path, vep_dir=output_vep_dir,
+                 template_dir="templates", latex_template=template,
+                 )
+    latex_pdf_path = run_latex_dir/f"{sample}_report.pdf"
+    shutil.copy(latex_pdf_path, output_dir)  # copy to main results dir
+
+
+    # if no_clean_temporary_files is False:
+    #     print("\t +++ INFO: Cleaning up Temporary and Intermediate Files")
+    #     if BAM_file is not None:
+    #         os.remove(BAM_file)
+    #         os.remove(f"{BAM_file}.bai")
+    #
+    #     shutil.rmtree(f"{temp_dir}")
+    #     for file in Path(f"{output_dir}/").glob("latest*.csv"):
+    #         os.remove(file)
+    #
+    #     subprocess.run(f"rm -rf {output_dir}/versionTable.txt", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/GeneStructure.pdf", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.out", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/texput.log", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.aux", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.log", shell=True)
+    #     subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.tex", shell=True)
+    #     subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True)
+
+    # time.sleep(1)
+    # if BAM_file is not None:
+    #     BAM_file = BAM_file.replace(".clean.BAM", "")
+    # print(
+    #     f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds")
 
 
 if __name__ == '__main__':
-    main_druid()
+    main()
+
diff --git a/scripts/GenomeChronicler_afogeno_generator_fromBAM.py b/scripts/GenomeChronicler_afogeno_generator_fromBAM.py
deleted file mode 100755
index cbbdfe4..0000000
--- a/scripts/GenomeChronicler_afogeno_generator_fromBAM.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env python3
-import subprocess
-import os
-import sys
-import gzip
-from os.path import basename
-from pathlib import Path
-
-import fire
-
-
-def build_genotype(a1, a2, gen):
-    alleles = [a1] + a2.split(",")
-    extras = [""] * len(alleles)
-    l1 = len(alleles[0])
-    extra = ""
-    for i in range(1, len(alleles)):
-        extras[i] = ""
-        l2 = len(alleles[i])
-        if l1 > 1 or l2 > 1:
-            if l2 > l1:
-                alleles[0] = "-"
-                alleles[i] = alleles[i][1:]
-                extras[i] = "I"
-            else:
-                
alleles[0] = alleles[0][1:] - alleles[i] = "-" - extras[i] = "D" - res = [alleles[int(i)] for i in gen.split("/")] - extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") - return res, extra - - -def uniq(l): - return list(set(l)) - - -def afogeno_generator_from_BAM(input_BAM, resultsdir="", numThreads=4, - dir_reference='reference', dir_software='software', workdir="", - ): - if 'SINGULARITY_NAME' in os.environ: - workdir = "/GenomeChronicler/" - - # resultsdir = workdir - ref_bed = workdir + f"{dir_reference}/snps.19-114.unique.nochr.bed" - gatk = workdir + f"{dir_software}/GenomeAnalysisTK.jar" - ref_hs38 = workdir + f"{dir_reference}/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - bcftools = "bcftools" - samtools = "samtools" - bgzip = workdir + f"{dir_software}/bgzip" - tabix = workdir + f"{dir_software}/tabix" - - # if not len(sys.argv) > 1: - # sys.exit("Please provide a single base recalibrated BAM file to this script and optional output directory\n") - - # input_BAM = sys.argv[1] - sample = basename(input_BAM).rsplit(".", 1)[0] - sample = sample.replace(".recal", "").replace(".bam.clean", "") - - # if len(sys.argv) > 2: - # resultsdir = sys.argv[2] - - # if len(sys.argv) > 3: - # numThreads = sys.argv[3] - - subprocess.run(["mkdir", "-p", f"{resultsdir}/temp"]) - - if not os.path.exists(input_BAM): - sys.exit(f"The BAM file specified does not exist, please double-check the path and try again [{input_BAM}]") - - if not os.path.exists(ref_bed): - sys.exit(f"The BED file specified does not exist, please double-check the path and try again [{ref_bed}]") - - if not os.path.exists(ref_bed + ".gz"): - # Sort and compress BED - subprocess.run(["sort", "-k", "1,1", "-k2,2n", ref_bed, ">", "tempAAA"]) - subprocess.run(["mv", "tempAAA", ref_bed]) - subprocess.run([bgzip, "-c", f"{ref_bed}" + " > " + f"{ref_bed}.gz"]) - - # Force tabix re-index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - if not os.path.exists(f"{ref_bed}.gz.tbi"): - # tabix index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - # Define the command to run GATK HaplotypeCaller - output_path = f"{resultsdir}/temp/{sample}.g.vcf" - if not Path(output_path).exists(): - runstr = f"java -jar {gatk}" - runstr += f" -T HaplotypeCaller -R {ref_hs38}" - runstr += f" -I {input_BAM} -nct {numThreads}" - runstr += f" --emitRefConfidence GVCF -L {ref_bed}" - runstr += f" -o {output_path}" - - # Run GATK HaplotypeCaller - # print(runstr) - os.system(runstr) - # exit() - - # Define the command to run GATK GenotypeGVCFs - output_path2 = f"{resultsdir}/temp/{sample}.genotypes.vcf" - if not Path(output_path2).exists(): - runstr2 = f"java -jar {gatk}" - runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - runstr2 += f" --variant {output_path}" - runstr2 += f" -allSites -o {output_path2}" - runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - - # Run GATK GenotypeGVCFs - os.system(runstr2) - - # Add rsIDs to the VCF file using bcftools and bgzip - # os.system(f"cat {resultsdir}/temp/{sample}.genotypes.vcf | {bcftools} annotate -a {ref_bed}.gz -c CHROM,-,POS,ID | {bgzip} -c > {resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz") - vcf_path = f"{resultsdir}/temp/{sample}.genotypes.vcf" - bed_path = f"{ref_bed}.gz" - out_path = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - - cat_proc = subprocess.Popen(["cat", vcf_path], stdout=subprocess.PIPE) - annotate_proc = subprocess.Popen([bcftools, "annotate", "-a", bed_path, "-c", "CHROM,-,POS,ID"], - stdin=cat_proc.stdout, stdout=subprocess.PIPE) - 
bgzip_proc = subprocess.Popen(["bgzip", "-c"], stdin=annotate_proc.stdout, stdout=subprocess.PIPE) - - with open(out_path, "wb") as f: - f.write(bgzip_proc.communicate()[0]) - - ## Generate AFO Geno 38 file - gen_data = {} - counter_template = {} - - # with gzip.open(ref_bed, "rt") as IN: - with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: - for line in IN: - line = line.strip() - if not line: - continue - - # data = line.split(b"\t") - data = line.decode().split("\t") - # gen_data.setdefault(data[0], {})[int(data[2])] = data - gen_data[(data[0], data[2])] = data - counter_template[(data[0], data[2])] = 0 - - VCFfilename = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - debugCounter = {} - - with subprocess.Popen( - f'gzip -dcf {VCFfilename} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', - shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ - open(f"{resultsdir}/temp/{sample}.afogeno38.txt", "w") as out_file: - # for line_bytes in proc1.stdout: - for line_bytes in proc1.stdout: - line = line_bytes.decode().rstrip() - - if not line or line.startswith("#"): - continue - - line = line.replace(",", "") - data = line.split("\t") - - tmp = data[4].split(":") - data[4] = tmp[0] - - genotype = ["NA", "NA"] - extra = "NA" - - pos = (data[0], data[1]) - - if data[4] != "./.": - a1, a2 = data[2], data[3] - genotype, extra = build_genotype(a1, a2, data[4]) - else: - # No reads there - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - continue - - data.extend(genotype) - data.append(extra) - - # pos = (data[0], data[1]) - if pos in gen_data: - d2 = gen_data[pos] - out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" - out_file.write(out_data) - else: - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - - # Move the original VCF file to a new location - os.rename(VCFfilename, f"{resultsdir}/{sample}.genotypingVCF.vcf.gz") - - -if __name__ == '__main__': - fire.Fire(afogeno_generator_from_BAM) diff --git a/scripts/GenomeChronicler_afogeno_generator_fromVCF.py b/scripts/GenomeChronicler_afogeno_generator_fromVCF.py deleted file mode 100755 index 2676e88..0000000 --- a/scripts/GenomeChronicler_afogeno_generator_fromVCF.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -import subprocess -import os -import sys -import gzip -from os.path import basename -from tqdm import tqdm -from pathlib import Path - -import fire - - -def build_genotype(a1, a2, gen): - alleles = [a1] + a2.split(",") - extras = [""] * len(alleles) - l1 = len(alleles[0]) - extra = "" - for i in range(1, len(alleles)): - extras[i] = "" - l2 = len(alleles[i]) - if l1 > 1 or l2 > 1: - if l2 > l1: - alleles[0] = "-" - alleles[i] = alleles[i][1:] - extras[i] = "I" - else: - alleles[0] = alleles[0][1:] - alleles[i] = "-" - extras[i] = "D" - res = [alleles[int(i)] for i in gen.split("/")] - extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") - return res, extra - - -def uniq(l): - return list(set(l)) - - -def afogeno_generator_from_VCF(input_VCF, sample, resultsdir="", - dir_reference='reference', dir_software='software', workdir="", ): - if 'SINGULARITY_NAME' in os.environ: - workdir = "/GenomeChronicler/" - - ref_bed = workdir + f"{dir_reference}/snps.19-114.unique.nochr.bed" - bcftools = "bcftools" - bgzip = workdir + f"{dir_software}/bgzip" - tabix = workdir + f"{dir_software}/tabix" - - # sample = basename(input_VCF).rsplit(".", 1)[0] - - subprocess.run(["mkdir", "-p", f"{resultsdir}/temp"]) - - if not 
os.path.exists(input_VCF): - sys.exit(f"The VCF file specified does not exist, please double-check the path and try again [{input_VCF}]") - - if not os.path.exists(ref_bed): - sys.exit(f"The BED file specified does not exist, please double-check the path and try again [{ref_bed}]") - - if not os.path.exists(ref_bed + ".gz"): - # Sort and compress BED - subprocess.run(["sort", "-k", "1,1", "-k2,2n", ref_bed, ">", "tempAAA"]) - subprocess.run(["mv", "tempAAA", ref_bed]) - subprocess.run([bgzip, "-c", f"{ref_bed}" + " > " + f"{ref_bed}.gz"]) - - # Force tabix re-index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - if not os.path.exists(f"{ref_bed}.gz.tbi"): - # tabix index - subprocess.run([tabix, "-p", "bed", f"{ref_bed}.gz"]) - - # Add rsIDs to the VCF file using bcftools and bgzip - # os.system(f"cat {resultsdir}/temp/{sample}.genotypes.vcf | {bcftools} annotate -a {ref_bed}.gz -c CHROM,-,POS,ID | {bgzip} -c > {resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz") - # vcf_path = f"{resultsdir}/temp/{sample}.genotypes.vcf" - - vcf_path = input_VCF - bed_path = f"{ref_bed}.gz" - out_path = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - - # subprocess.run(f"cat {vcf_path} | {bcftools} annotate -a {bed_path} -c CHROM,-,POS,ID | {bgzip} -c > {out_path}", shell=True) - - cat_proc = subprocess.Popen(["cat", vcf_path], stdout=subprocess.PIPE) - annotate_proc = subprocess.Popen([bcftools, "annotate", "-a", bed_path, "-c", "CHROM,-,POS,ID"], - stdin=cat_proc.stdout, stdout=subprocess.PIPE) - bgzip_proc = subprocess.Popen(["bgzip", "-c"], stdin=annotate_proc.stdout, stdout=subprocess.PIPE) - - with open(out_path, "wb") as f: - f.write(bgzip_proc.communicate()[0]) - - # Generate AFO Geno 38 file - gen_data = {} - counter_template = {} - - # with gzip.open(ref_bed, "rt") as IN: - with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: - for line in IN: - line = line.strip() - if not line: - continue - - # data = line.split(b"\t") - data = line.decode().split("\t") - # gen_data.setdefault(data[0], {})[int(data[2])] = data - gen_data[(data[0], data[2])] = data - counter_template[(data[0], data[2])] = 0 - - VCFfilename = f"{resultsdir}/temp/{sample}.genotypes.rsIDs.vcf.gz" - debugCounter = {} - - with subprocess.Popen( - f'gzip -dcf {VCFfilename} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', - shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ - open(f"{resultsdir}/temp/{sample}.afogeno38.txt", "w") as out_file: - # for line_bytes in proc1.stdout: - for line_bytes in proc1.stdout: - line = line_bytes.decode().rstrip() - - if not line or line.startswith("#"): - continue - - line = line.replace(",", "") - data = line.split("\t") - - tmp = data[4].split(":") - data[4] = tmp[0] - - genotype = ["NA", "NA"] - extra = "NA" - - pos = (data[0], data[1]) - - if data[4] != "./.": - a1, a2 = data[2], data[3] - genotype, extra = build_genotype(a1, a2, data[4]) - else: - # No reads there - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - continue - - data.extend(genotype) - data.append(extra) - - # pos = (data[0], data[1]) - if pos in gen_data: - d2 = gen_data[pos] - out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" - out_file.write(out_data) - else: - debugCounter[pos] = debugCounter.get(pos, 0) + 1 - - # Move the original VCF file to a new location - os.rename(VCFfilename, f"{resultsdir}/{sample}.genotypingVCF.vcf.gz") - - -if __name__ == '__main__': - fire.Fire(afogeno_generator_from_VCF) diff --git 
a/scripts/GenomeChronicler_ancestry_generator_fromBAM.py b/scripts/GenomeChronicler_ancestry_generator_fromBAM.py deleted file mode 100755 index 78cbbe4..0000000 --- a/scripts/GenomeChronicler_ancestry_generator_fromBAM.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import subprocess -from os.path import basename - -from pathlib import Path -import fire - -def _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix): - """ - Get list of SNPs from 1kGP - - Input files: - - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - Output files: - - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz" - - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz.tbi" - - """ - output_path = f"{resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed" - if Path(output_path).exists(): - return - with open(initialBIM, "r") as inf, open(output_path, "w") as outf: - outf.write("#CHROM\tST0\tPOS\tID\n") - for line in inf: - a = line.strip().split("\t") - if hasCHRflag == 1 and not a[0].startswith("chr"): - a[0] = "chr" + a[0] - pp = a[0] - pp = pp.replace('chr','') - if int(pp) > 22: - continue - if a[4] == 'T' and a[5] == 'A': - continue - if a[4] == 'A' and a[5] == 'T': - continue - if a[4] == 'C' and a[5] == 'G': - continue - if a[4] == 'G' and a[5] == 'C': - continue - outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") - os.system(f"{bgzip} -c {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed > {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - os.system(f"{tabix} -p bed {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - - -def _get_gvcf(bam, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip): - """ - Convert BAM to a gvcf that contains only the 1kgp snps filtered above - - Input files: - - f"{dir}bam/{sample}.bam" - - {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed - - - {ref_hs38} - - Output files: - - f"{dir}temp/{sample}.g.vcf" - - f"{dir}temp/{sample}.genotypes.vcf" - - f"{dir}temp/{sample}.rsIDs.gvcf.gz" - - """ - tmp_dir = f"{resultsdir}/temp" - - output_g_path = f"{tmp_dir}/{sample}.ancestry.g.vcf" - if not Path(output_g_path).exists(): - runstr = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - runstr += f" -T HaplotypeCaller -R {ref_hs38}" - runstr += f" -I {bam} -nct {numThreads}" - runstr += f" --emitRefConfidence GVCF -L {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed" - runstr += f" -o {output_g_path}" - os.system(runstr) - - output_genotypes_path = f"{tmp_dir}/{sample}.ancestry.genotypes.vcf" - if not Path(output_genotypes_path).exists(): - # return - runstr2 = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - runstr2 += f" --variant {output_g_path}" - runstr2 += f" -allSites -o {output_genotypes_path}" - runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - os.system(runstr2) - - output_path = f"{tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz" - if not Path(output_path).exists(): - os.system(f"cat {output_genotypes_path} | {bcftools} annotate -a {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed.gz -c CHROM,-,POS,ID | {bgzip} -c > {output_path}") - - -def _gvcf_to_plink(plink, resultsdir, sample): - """ - Convert gvcf to plink files (bed, bim, fam) - - Input files: - - {tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz - - Output files: - - {tmp_dir}/{sample}.bed - - {tmp_dir}/{sample}.bim - - {tmp_dir}/{sample}.fam - - {tmp_dir}/{sample}.fam.mod - - {tmp_dir}/{sample}.log (update) - - {tmp_dir}/{sample}.nosex - - """ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}.fam" - if Path(output_path).exists(): - 
return - # convert gvcf to plink (generate bed, bim, fam) - runstr1q = f"{plink} --biallelic-only --vcf-require-gt" - runstr1q += f" --vcf {tmp_dir}/{sample}.ancestry.rsIDs.gvcf.gz" - runstr1q += f" --out {tmp_dir}/{sample}" - runstr1q += " --allow-extra-chr --make-bed --double-id" - subprocess.run(runstr1q, shell=True) - - # the vcf has the SN (sample name) from the BAM, so when converts that to plink it gives me problems if there is a "_" in the name. - # So I'm changin it and I put the same $sample name as FIID and IID. - # Also, if the IID is not the same as $sample, the R script will not find the sameple and will not be plotted - cmd = f"awk '{{ $1 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" - subprocess.run(cmd, shell=True) - cmd2 = f"awk '{{ $2 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" - subprocess.run(cmd2, shell=True) - - -def _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc): - """ - Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - - Four groups of files are generated: - - Input files: - - {tmp_dir}/{sample}.bed - - {tmp_dir}/{sample}.bim - - {tmp_dir}/{sample}.fam - - {tmp_dir}/{sample}.fam.mod - - initialAnc f"{dir}reference/1kGP_GRCh38_exome" - - Output files: - - {tmp_dir}/{sample}_1kGP_0 (-merge.missnp, -merge.fam, .log) - - {tmp_dir}/1kGP_2 (.bed, .bim, .fam, .log, .nosex) - - {tmp_dir}/{sample}_2 (.bed, .bim, .fam, .log, .nosex) - - {tmp_dir}/{sample}_1kGP (.bed, .bim, .fam, .log, .nosex) - - """ - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.fam" - # if Path(output_path).exists(): - # return - # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - output_path = f"{tmp_dir}/{sample}_1kGP_0-merge.fam" - if not Path(output_path).exists(): - runstr3 = f"{plink} --bfile {tmp_dir}/{sample} --bmerge {initialAnc} --out {tmp_dir}/{sample}_1kGP_0 --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3) - - # Remove discarded SNPs from the pgp and 1kGP file - output_path = f"{tmp_dir}/1kGP_2.bed" - if not Path(output_path).exists(): - runstr1z = f"{plink} --bfile {initialAnc} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/1kGP_2 --make-bed --allow-extra-chr" - os.system(runstr1z) - - output_path = f"{tmp_dir}/{sample}_2.bed" - if not Path(output_path).exists(): - runstr1z2 = f"{plink} --bfile {tmp_dir}/{sample} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/{sample}_2 --make-bed --allow-extra-chr" - os.system(runstr1z2) - - # Merge the pgp and 1kGP files again - output_path = f"{tmp_dir}/{sample}_1kGP.bed" - if not Path(output_path).exists(): - runstr3a = f"{plink} --bfile {tmp_dir}/{sample}_2 --bmerge {tmp_dir}/1kGP_2 --out {tmp_dir}/{sample}_1kGP --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3a) - - -def _subset_unlinked_snps(plink, resultsdir, sample): - """ - Choose a subset of filtered and independent (unlinked) SNPs - - Input files: - - {tmp_dir}/{sample}_1kGP* - - Output files: - - {tmp_dir}/snps_to_prune (.log, .prune.in, .prune.out, .nosex) - - {tmp_dir}/{sample}_1kGP_pruned (.bed, .bim, .fam, .log, .nosex) - - """ - tmp_dir = f"{resultsdir}/temp" - - # Choose a subset of filtered and independent (unlinked) SNPs - - # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals. 
- output_path = f"{tmp_dir}/snps_to_prune.prune.in" - if not Path(output_path).exists(): - runstr4 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/snps_to_prune --indep-pairwise 100 5 0.1" - os.system(runstr4) - - # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1 - output_path = f"{tmp_dir}/{sample}_1kGP_pruned.bed" - if not Path(output_path).exists(): - runstr5 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/{sample}_1kGP_pruned --extract {tmp_dir}/snps_to_prune.prune.in --make-bed --mind 0.1" - os.system(runstr5) - - -def ancestry_generator_from_BAM(bam, resultsdir="", numThreads=4, - dir_reference='reference', dir_software='software', workdir="", - ): - - dir = "" - if 'SINGULARITY_NAME' in os.environ: - dir = "/GenomeChronicler/" - - # resultsdir = dir - plink = f"{dir}software/plink" - gatk = f"{dir}software/GenomeAnalysisTK.jar" - ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - initialAnc = f"{dir}reference/1kGP_GRCh38_exome" - bcftools = "bcftools" - samtools = "samtools" - bgzip = "bgzip" - tabix = "tabix" - - hasCHRflag = 0 - numThreads = 4 - - # if not sys.argv: - # print("Please provide a single base recalibrated BAM file to this script") - # sys.exit(1) - - # bam = sys.argv[0] - sample = basename(bam) - sample = sample.rsplit(".", 1)[0] - sample = sample.replace(".recal", "") - sample = sample.replace(".bam.clean", "") - - # if len(sys.argv) > 1: - # resultsdir = sys.argv[1] - - # if len(sys.argv) > 2: - # numThreads = sys.argv[2] - - os.system(f"mkdir -p {resultsdir}/temp") - - # Also check for the need to samtools index this stuff - # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict - - # ------------------------------------------ - # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with - # -- only SNPs from exome (914577 total) - # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there - # -- will not be artifacts due to reporting SNPS i different stransd ib both datasets - # -- I also remove sexual chromosomes - # ------------------------------------------ - - # _get_list_of_snps_from_1kGP() - _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix) - - # ------------------------------------------ - # -- Convert BAM to a gvcf that contains only - # - the 1kgp snps filtered above - # ------------------------------------------ - - # _get_gvcf() - _get_gvcf(bam, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip) - - # ------------------------------------------ - # -- Convert GVCF data to PLINK format - # ------------------------------------------ - - # _gvcf_to_plink() - _gvcf_to_plink(plink, resultsdir, sample) - - # ------------------------------------------ - # -- Merge gvcf with 1kGP data. I need to merge the files twice, the first one will fail if there are SNPs with - # -- different alleles in both datasets. 
The second merge will exclude these SNPs - # ------------------------------------------ - - # _merge_pgp_1kGP() - _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc) - - # ------------------------------------------ - # -- Reduce the merged dataset further by removing SPs in LD - # ------------------------------------------ - - # _subset_unlinked_snps() - _subset_unlinked_snps(plink, resultsdir, sample) - # ------------------------------------------ - # -- Perform PCA - # ------------------------------------------ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned_pca_20.eigenvec" - if not Path(output_path).exists(): - runstr6 = f"{plink}" - runstr6 += f" --bfile {tmp_dir}/{sample}_1kGP_pruned" - runstr6 += f" --out {tmp_dir}/{sample}_1kGP_pruned_pca_20" - runstr6 += " --pca" - subprocess.run(runstr6, shell=True) - - -if __name__ == '__main__': - fire.Fire(ancestry_generator_from_BAM) \ No newline at end of file diff --git a/scripts/GenomeChronicler_ancestry_generator_fromVCF.py b/scripts/GenomeChronicler_ancestry_generator_fromVCF.py deleted file mode 100755 index 5554115..0000000 --- a/scripts/GenomeChronicler_ancestry_generator_fromVCF.py +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import subprocess -from os.path import basename - -from pathlib import Path -import fire - -# get list of SNPs from 1kGP -def _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix): - output_path = f"{resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed" - if Path(output_path).exists(): - return - with open(initialBIM, "r") as inf, open(output_path, "w") as outf: - outf.write("#CHROM\tST0\tPOS\tID\n") - for line in inf: - a = line.strip().split("\t") - if hasCHRflag == 1 and not a[0].startswith("chr"): - a[0] = "chr" + a[0] - pp = a[0] - pp = pp.replace('chr','') - if int(pp) > 22: - continue - if a[4] == 'T' and a[5] == 'A': - continue - if a[4] == 'A' and a[5] == 'T': - continue - if a[4] == 'C' and a[5] == 'G': - continue - if a[4] == 'G' and a[5] == 'C': - continue - outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") - os.system(f"{bgzip} -c {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed > {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - os.system(f"{tabix} -p bed {resultsdir}/temp/1kGP_GRCh38_nonAT_CG.bed.gz") - - -# convert BAM to a gvcf that contains only the 1kgp snps filtered above -def _get_gvcf(vcf, resultsdir, sample, bcftools, bgzip): - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.rsIDs.gvcf.gz" - # if Path(output_path).exists(): - # return - # runstr = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - # runstr += f" -T HaplotypeCaller -R {ref_hs38}" - # runstr += f" -I {bam} -nct {numThreads}" - # runstr += f" --emitRefConfidence GVCF -L {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed" - # runstr += f" -o {tmp_dir}/{sample}.g.vcf" - # os.system(runstr) - # runstr2 = f"java -Xmx40g -Djava.io.tmpdir={tmp_dir}/tmpdir -jar {gatk}" - # runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" - # runstr2 += f" --variant {tmp_dir}/{sample}.g.vcf" - # runstr2 += f" -allSites -o {tmp_dir}/{sample}.genotypes.vcf" - # runstr2 += " -stand_emit_conf 10 -stand_call_conf 30" - # os.system(runstr2) - os.system(f"cat {vcf} | {bcftools} annotate -a {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed.gz -c CHROM,-,POS,ID | {bgzip} -c > {tmp_dir}/{sample}.rsIDs.gvcf.gz") - - -def _gvcf_to_plink(plink, resultsdir, sample): - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}.fam" - if 
Path(output_path).exists(): - return - # convert gvcf to plink - runstr1q = f"{plink} --biallelic-only --vcf-require-gt" - runstr1q += f" --vcf {tmp_dir}/{sample}.rsIDs.gvcf.gz" - runstr1q += f" --out {tmp_dir}/{sample}" - runstr1q += " --allow-extra-chr --make-bed --double-id" - subprocess.run(runstr1q, shell=True) - - # the vcf has the SN (sample name) from the BAM, so when converts that to plink it gives me problems if there is a "_" in the name. - # So I'm changin it and I put the same $sample name as FIID and IID. - # Also, if the IID is not the same as $sample, the R script will not find the sameple and will not be plotted - cmd = f"awk '{{ $1 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" - subprocess.run(cmd, shell=True) - cmd2 = f"awk '{{ $2 = \"{sample}\"; print }}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" - subprocess.run(cmd2, shell=True) - -# def _gvcf_to_plink(plink, resultsdir, sample): -# tmp_dir = f"{resultsdir}/temp" -# # Convert gvcf to plink -# runstr1q = f"{plink} --biallelic-only --vcf-require-gt --vcf {tmp_dir}/{sample}.rsIDs.gvcf.gz --out {tmp_dir}/{sample} --allow-extra-chr --make-bed --double-id" -# os.system(runstr1q) - -# # Change sample names to avoid errors and inconsistencies -# cmd = f"awk '{{\$1 = \"{sample}\"; print}}' {tmp_dir}/{sample}.fam > {tmp_dir}/{sample}.fam.mod" -# os.system(cmd) -# cmd2 = f"awk '{{\$2 = \"{sample}\"; print}}' {tmp_dir}/{sample}.fam.mod > {tmp_dir}/{sample}.fam" -# os.system(cmd2) - - -def _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc): - tmp_dir = f"{resultsdir}/temp" - # output_path = f"{tmp_dir}/{sample}.fam" - # if Path(output_path).exists(): - # return - # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets - runstr3 = f"{plink} --bfile {tmp_dir}/{sample} --bmerge {initialAnc} --out {tmp_dir}/{sample}_1kGP_0 --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3) - - # Remove discarded SNPs from the pgp and 1kGP file - runstr1z = f"{plink} --bfile {initialAnc} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/1kGP_2 --make-bed --allow-extra-chr" - os.system(runstr1z) - runstr1z2 = f"{plink} --bfile {tmp_dir}/{sample} --exclude {tmp_dir}/{sample}_1kGP_0-merge.missnp --out {tmp_dir}/{sample}_2 --make-bed --allow-extra-chr" - os.system(runstr1z2) - - # Merge the pgp and 1kGP files again - runstr3a = f"{plink} --bfile {tmp_dir}/{sample}_2 --bmerge {tmp_dir}/1kGP_2 --out {tmp_dir}/{sample}_1kGP --geno 0 --allow-extra-chr --make-bed" - os.system(runstr3a) - - -def _subset_unlinked_snps(plink, resultsdir, sample): - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned.bed" - if Path(output_path).exists(): - return - - # Choose a subset of filtered and independent (unlinked) SNPs - - # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals. 
- runstr4 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/snps_to_prune --indep-pairwise 100 5 0.1" - os.system(runstr4) - - # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1 - runstr5 = f"{plink} --bfile {tmp_dir}/{sample}_1kGP --out {tmp_dir}/{sample}_1kGP_pruned --extract {tmp_dir}/snps_to_prune.prune.in --make-bed --mind 0.1" - os.system(runstr5) - - -def ancestry_generator_from_VCF(vcf, sample, resultsdir=""): - - dir = "" - if 'SINGULARITY_NAME' in os.environ: - dir = "/GenomeChronicler/" - - # resultsdir = dir - plink = f"{dir}software/plink" - # gatk = f"{dir}software/GenomeAnalysisTK.jar" - # ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" - initialAnc = f"{dir}reference/1kGP_GRCh38_exome" - bcftools = "bcftools" - # samtools = "samtools" - bgzip = "bgzip" - tabix = "tabix" - - hasCHRflag = 0 - numThreads = 4 - - # if not sys.argv: - # print("Please provide a single base recalibrated VCF file to this script") - # sys.exit(1) - - # VCF = sys.argv[0] - # sample = basename(vcf) - # sample = sample.rsplit(".", 1)[0] - # sample = sample.replace(".recal", "") - # sample = sample.replace(".bam.clean", "") - - # if len(sys.argv) > 1: - # resultsdir = sys.argv[1] - - # if len(sys.argv) > 2: - # numThreads = sys.argv[2] - - os.system(f"mkdir -p {resultsdir}/temp") - - # Also check for the need to samtools index this stuff - # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict - - # ------------------------------------------ - # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with - # -- only SNPs from exome (914577 total) - # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there - # -- will not be artifacts due to reporting SNPS i different stransd ib both datasets - # -- I also remove sexual chromosomes - # ------------------------------------------ - - # _get_list_of_snps_from_1kGP() - _get_list_of_snps_from_1kGP(initialBIM, sample, resultsdir, hasCHRflag, bgzip, tabix) - - # ------------------------------------------ - # -- Convert BAM to a gvcf that contains only - # - the 1kgp snps filtered above - # ------------------------------------------ - - # _get_gvcf() - # _get_gvcf(vcf, resultsdir, sample, gatk, ref_hs38, numThreads, bcftools, bgzip) - _get_gvcf(vcf, resultsdir, sample, bcftools, bgzip) - - # ------------------------------------------ - # -- Convert GVCF data to PLINK format - # ------------------------------------------ - - # _gvcf_to_plink() - _gvcf_to_plink(plink, resultsdir, sample) - - # ------------------------------------------ - # -- Merge gvcf with 1kGP data. I need to merge the files twice, the first one will fail if there are SNPs with - # -- different alleles in both datasets. 
The second merge will exclude these SNPs - # ------------------------------------------ - - # _merge_pgp_1kGP() - _merge_pgp_1kGP(plink, resultsdir, sample, initialAnc) - # print('test') - # exit() - - # ------------------------------------------ - # -- Reduce the merged dataset further by removing SPs in LD - # ------------------------------------------ - - # _subset_unlinked_snps() - _subset_unlinked_snps(plink, resultsdir, sample) - # ------------------------------------------ - # -- Perform PCA - # ------------------------------------------ - tmp_dir = f"{resultsdir}/temp" - output_path = f"{tmp_dir}/{sample}_1kGP_pruned_pca_20.eigenvec" - if Path(output_path).exists(): - return - runstr6 = f"{plink}" - runstr6 += f" --bfile {tmp_dir}/{sample}_1kGP_pruned" - runstr6 += f" --out {tmp_dir}/{sample}_1kGP_pruned_pca_20" - runstr6 += " --pca" - subprocess.run(runstr6, shell=True) - - -if __name__ == '__main__': - fire.Fire(ancestry_generator_from_VCF) \ No newline at end of file diff --git a/scripts/GenomeChronicler_quickFilterFinalReportTables.py b/scripts/GenomeChronicler_quickFilterFinalReportTables.py deleted file mode 100755 index 1376c8a..0000000 --- a/scripts/GenomeChronicler_quickFilterFinalReportTables.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 - -from pathlib import Path - -import pandas as pd -import fire - - -def filter_final_report_csv(csv_input_path, csv_output_path=None, DEBUG=False): - ''' - - ''' - # read csv - df = pd.read_csv(csv_input_path,dtype={'Mag.':'str'}) - - # filter - if len(df.columns) <= 4: - if DEBUG: - print("WARNING: No links found") - return - df = df[~(df['Mag.'].astype('float')==0)] - df = df[~(df['Identifier'].str.startswith('{rs')|df['Identifier'].str.startswith('{i'))] - - # export - if csv_output_path is None: # inplace - # warning, inplace operation - csv_output_path = csv_input_path - Path(csv_output_path).parent.mkdir(parents=True, exist_ok=True) - df.to_csv(csv_output_path, index=False) - - -if __name__ == '__main__': - fire.Fire(filter_final_report_csv) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/afogeno_generator.py b/scripts/afogeno_generator.py new file mode 100755 index 0000000..9f01ebe --- /dev/null +++ b/scripts/afogeno_generator.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +import os +import subprocess +import sys +from pathlib import Path + +import fire + +if __name__ == '__main__': + sys.path.append(os.path.abspath(os.getcwd())) +from scripts.ancestry_generator import get_gvcf + + +def build_genotype(a1, a2, gen): + alleles = [a1] + a2.split(",") + extras = [""] * len(alleles) + l1 = len(alleles[0]) + extra = "" + for i in range(1, len(alleles)): + extras[i] = "" + l2 = len(alleles[i]) + if l1 > 1 or l2 > 1: + if l2 > l1: + alleles[0] = "-" + alleles[i] = alleles[i][1:] + extras[i] = "I" + else: + alleles[0] = alleles[0][1:] + alleles[i] = "-" + extras[i] = "D" + res = [alleles[int(i)] for i in gen.split("/")] + extra = ";".join([extras[int(i)] for i in gen.split("/")]).strip(";") + return res, extra + + +# def uniq(l): +# return list(set(l)) + +def prepare_snps_ref_bed(ref_input_path, output_dir, bgzip="bgzip", tabix="tabix"): + """ + Sort and compress BED file + - Sort by chromosome and position + - Compress with bgzip + - Index with tabix + + Input: .bed + Output: .bed.gz, .bed.gz.tbi + + Parameters + ---------- + ref_input_path + output_dir + bgzip + tabix + + Returns + ------- + + """ + ref_input_path = Path(ref_input_path) + output_dir = 
Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + ref_path = output_dir / f"{ref_input_path.name}" + gz_path = output_dir / f"{ref_input_path.name}.gz" + gz_tbi_path = output_dir / f"{ref_input_path.name}.gz.tbi" + + # Sort and compress BED + if not gz_path.exists(): + # subprocess.run(["sort", "-k", "1,1", "-k2,2n", str(ref_input_path), ">", str(ref_path)]) + # subprocess.run(["mv", "tempAAA", ref_path]) + # subprocess.run([bgzip, "-c", f"{ref_path}" + " > " + f"{gz_path}"]) + cmd = f"sort -k 1,1 -k2,2n {ref_input_path} > {ref_path}" + subprocess.run(cmd, shell=True) + cmd = f"bgzip -c {ref_path} > {gz_path}" + subprocess.run(cmd, shell=True) + + # Force tabix re-index + if not gz_tbi_path.exists(): + # tabix index + # subprocess.run([tabix, "-p", "bed", f"{gz_path}"]) + cmd = f"tabix -p bed {gz_path}" + subprocess.run(cmd, shell=True) + +def generate_afogeno_38(vcf_path, ref_bed, output_dir): + ## Generate AFO Geno 38 file + gen_data = {} + counter_template = {} + + # with gzip.open(ref_bed, "rt") as IN: + with subprocess.Popen(["gzip", "-dcf", ref_bed], stdout=subprocess.PIPE).stdout as IN: + for line in IN: + line = line.strip() + if not line: + continue + + # data = line.split(b"\t") + data = line.decode().split("\t") + # gen_data.setdefault(data[0], {})[int(data[2])] = data + gen_data[(data[0], data[2])] = data + counter_template[(data[0], data[2])] = 0 + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + # output_vcf_path = output_dir/f"afogeno.genotypes.rsIDs.vcf.gz" + afogeno_path = output_dir/f"afogeno38.txt" + debugCounter = {} + + with subprocess.Popen( + f'gzip -dcf {vcf_path} | grep -v "0/0:0:0:0:0,0,0" | grep -v LowQual | cut -f 1,2,4,5,10 | uniq', + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc1, \ + open(afogeno_path, "w") as out_file: + # for line_bytes in proc1.stdout: + for line_bytes in proc1.stdout: + line = line_bytes.decode().rstrip() + + if not line or line.startswith("#"): + continue + + line = line.replace(",", "") + data = line.split("\t") + + tmp = data[4].split(":") + data[4] = tmp[0] + + genotype = ["NA", "NA"] + extra = "NA" + + pos = (data[0], data[1]) + + if data[4] != "./.": + a1, a2 = data[2], data[3] + genotype, extra = build_genotype(a1, a2, data[4]) + else: + # No reads there + debugCounter[pos] = debugCounter.get(pos, 0) + 1 + continue + + data.extend(genotype) + data.append(extra) + + # pos = (data[0], data[1]) + if pos in gen_data: + d2 = gen_data[pos] + out_data = "\t".join(d2) + "\t" + "\t".join(data) + "\n" + out_file.write(out_data) + else: + debugCounter[pos] = debugCounter.get(pos, 0) + 1 + + +def generate_afogeno_pipeline(bam_or_vcf, output_dir="", numThreads=4, + dir_reference='reference', dir_software='software', workdir="", + ): + if 'SINGULARITY_NAME' in os.environ: + workdir = "/GenomeChronicler/" + workdir = Path(workdir) + + # resultsdir = workdir + ref_bed = workdir / f"{dir_reference}/snps.19-114.unique.nochr.bed" + gatk = workdir / f"{dir_software}/GenomeAnalysisTK.jar" + ref_hs38 = workdir / f"{dir_reference}/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa" + bcftools = "bcftools" + bgzip = "bgzip" + tabix = "tabix" + + output_dir = Path(output_dir) + plink_snps_dir = output_dir / 'prepare_snps_ref_bed' + prepare_snps_ref_bed(ref_bed, plink_snps_dir, bgzip, tabix) + + ref_snps = plink_snps_dir/'snps.19-114.unique.nochr.bed' + vcf_dir = output_dir/'vcf_afogeno' + get_gvcf(bam_or_vcf, vcf_dir, ref_hs38, ref_snps, 'afogeno', gatk, numThreads, bcftools, bgzip) + + 
## Generate AFO Geno 38 file + vcf_afogeno_path = vcf_dir / f'afogeno.rsIDs.gvcf.gz' + generate_afogeno_38(vcf_afogeno_path, ref_bed, output_dir) + + +funcs = { + 'prepare_snps_ref_bed': prepare_snps_ref_bed, + 'generate_afogeno_38': generate_afogeno_38, + 'generate_afogeno_pipeline': generate_afogeno_pipeline, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/ancestry_generator.py b/scripts/ancestry_generator.py new file mode 100755 index 0000000..e4cc1dd --- /dev/null +++ b/scripts/ancestry_generator.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 + +import os +import subprocess +from pathlib import Path + +import fire + + +def get_list_of_snps_from_1kGP(initialBIM, output_dir, hasCHRflag=0, bgzip='bgzip', tabix='tabix'): + """ + Get list of SNPs from 1kGP + + Input files: + - initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim" + Output files: + - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz" + - f"{dir}temp/1kGP_GRCh38_nonAT_CG.bed.gz.tbi" + + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir/f"1kGP_GRCh38_nonAT_CG.bed" + if Path(output_path).exists(): + return + with open(initialBIM, "r") as inf, open(output_path, "w") as outf: + outf.write("#CHROM\tST0\tPOS\tID\n") + for line in inf: + a = line.strip().split("\t") + if hasCHRflag == 1 and not a[0].startswith("chr"): + a[0] = "chr" + a[0] + pp = a[0] + pp = pp.replace('chr','') + if int(pp) > 22: + continue + if a[4] == 'T' and a[5] == 'A': + continue + if a[4] == 'A' and a[5] == 'T': + continue + if a[4] == 'C' and a[5] == 'G': + continue + if a[4] == 'G' and a[5] == 'C': + continue + outf.write(f"{a[0]}\t{int(a[3])-1}\t{a[3]}\t{a[1]}\n") + os.system(f"{bgzip} -c {output_dir}/1kGP_GRCh38_nonAT_CG.bed > {output_dir}/1kGP_GRCh38_nonAT_CG.bed.gz") + os.system(f"{tabix} -p bed {output_dir}/1kGP_GRCh38_nonAT_CG.bed.gz") + + +def get_gvcf(vcf_or_bam, output_dir, ref_hs38, ref_1kGP, name='ancestry', + gatk='software/GenomeAnalysisTK.jar', numThreads=4, bcftools='bcftools', bgzip='bgzip'): + """ + Convert BAM to a gvcf that contains only the 1kgp snps filtered above + + Input files: + - f"{dir}bam/clean.bam" + + - {ref_hs38} reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa + - {ref_1kGP} {tmp_dir}/1kGP_GRCh38_nonAT_CG.bed + + Output files: + - f"{dir}temp/g.vcf" + - f"{dir}temp/genotypes.vcf" + - f"{dir}temp/rsIDs.gvcf.gz" + + """ + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + + ref_1kGP = Path(ref_1kGP) + ref_1kGP_gz = f"{ref_1kGP}.gz" + + if str(vcf_or_bam).lower().endswith('.bam'): + # Define the command to run GATK HaplotypeCaller + output_g_path = f"{output_dir}/{name}.g.vcf" + if not Path(output_g_path).exists(): + runstr = f"java -Xmx40g -Djava.io.tmpdir={output_dir}/tmpdir -jar {gatk}" + runstr += f" -T HaplotypeCaller -R {ref_hs38}" + runstr += f" -I {vcf_or_bam} -nct {numThreads}" + runstr += f" --emitRefConfidence GVCF -L {ref_1kGP}" + runstr += f" -o {output_g_path}" + os.system(runstr) + + # Define the command to run GATK GenotypeGVCFs + output_genotypes_path = f"{output_dir}/{name}.genotypes.vcf" + if not Path(output_genotypes_path).exists(): + # return + runstr2 = f"java -Xmx40g -Djava.io.tmpdir={output_dir}/tmpdir -jar {gatk}" + runstr2 += f" -T GenotypeGVCFs -R {ref_hs38}" + runstr2 += f" --variant {output_g_path}" + runstr2 += f" -allSites -o 
{output_genotypes_path}"
+        runstr2 += " -stand_emit_conf 10 -stand_call_conf 30"
+        os.system(runstr2)
+    else:
+        output_genotypes_path = vcf_or_bam
+
+    # Add rsIDs to the VCF file using bcftools and bgzip
+    output_path = f"{output_dir}/{name}.rsIDs.gvcf.gz"
+    if not Path(output_path).exists():
+        cmd = f"cat {output_genotypes_path} | {bcftools} annotate -a {ref_1kGP_gz} -c CHROM,-,POS,ID | {bgzip} -c > {output_path}"
+        ret = subprocess.run(cmd, shell=True)
+
+
+def gvcf_to_plink(gvcf_path, output_dir, output_name='sample', plink='software/plink'):
+    """
+    Convert a gVCF to PLINK files (bed, bim, fam)
+
+    Input files:
+    - {output_dir}/{sample}.ancestry.rsIDs.gvcf.gz
+
+    Output files:
+    - {output_dir}/{sample}.bed
+    - {output_dir}/{sample}.bim
+    - {output_dir}/{sample}.fam
+    - {output_dir}/{sample}.fam.mod
+    - {output_dir}/{sample}.log (update)
+    - {output_dir}/{sample}.nosex
+
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    output_path = f"{output_dir}/{output_name}.fam"
+    if Path(output_path).exists():
+        return
+
+    # convert gvcf to plink (generate bed, bim, fam)
+    runstr1q = f"{plink} --biallelic-only --vcf-require-gt"
+    runstr1q += f" --vcf {gvcf_path}"
+    runstr1q += f" --out {output_dir}/{output_name}"
+    runstr1q += " --allow-extra-chr --make-bed --double-id"
+    subprocess.run(runstr1q, shell=True)
+
+    # The VCF carries the SN (sample name) from the BAM, and when PLINK converts it, a "_" in the name causes problems.
+    # So I'm changing it and putting the same $sample name as both FID and IID.
+    # Also, if the IID is not the same as $sample, the R script will not find the sample and it will not be plotted.
+    cmd = f"awk '{{ $1 = \"{output_name}\"; print }}' {output_dir}/{output_name}.fam > {output_dir}/{output_name}.fam.mod"
+    subprocess.run(cmd, shell=True)
+    cmd2 = f"awk '{{ $2 = \"{output_name}\"; print }}' {output_dir}/{output_name}.fam.mod > {output_dir}/{output_name}.fam"
+    subprocess.run(cmd2, shell=True)
+
+
+def merge_pgp_1kGP(plink_result_dir, output_dir, ref_initialAnc='reference/1kGP_GRCh38_exome', plink='software/plink'):
+    """
+    Merge the pgp and 1kGP files and get the SNPs that disagree between the two datasets
+    - Four groups of files are generated (listed under Output files below)
+
+    Input files:
+    - {plink_result_dir}/{sample}.bed
+    - {plink_result_dir}/{sample}.bim
+    - {plink_result_dir}/{sample}.fam
+    - {plink_result_dir}/{sample}.fam.mod
+    - initialAnc f"{dir}reference/1kGP_GRCh38_exome"
+
+    Output files:
+    - {resultsdir}/{sample}_1kGP_0 (-merge.missnp, -merge.fam, .log)
+    - {resultsdir}/1kGP_2 (.bed, .bim, .fam, .log, .nosex)
+    - {resultsdir}/{sample}_2 (.bed, .bim, .fam, .log, .nosex)
+    - {resultsdir}/{sample}_1kGP (.bed, .bim, .fam, .log, .nosex)
+
+    """
+    # resultsdir = f"{resultsdir}/temp"
+    # output_path = f"{resultsdir}/{sample}.fam"
+    # if Path(output_path).exists():
+    #     return
+    # Merge the pgp and 1kGP files and get the SNPs that disagree between datasets
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    output_path = f"{output_dir}/1kGP_0-merge.fam"
+    if not Path(output_path).exists():
+        runstr3 = f"{plink} --bfile {plink_result_dir} --bmerge {ref_initialAnc} --out {output_dir}/1kGP_0 --geno 0 --allow-extra-chr --make-bed"
+        os.system(runstr3)
+
+    # Remove discarded SNPs from the pgp and 1kGP file
+    output_path = f"{output_dir}/1kGP_2.bed"
+    if not Path(output_path).exists():
+        runstr1z = f"{plink} --bfile {ref_initialAnc} --exclude {output_dir}/1kGP_0-merge.missnp --out {output_dir}/1kGP_2 --make-bed --allow-extra-chr"
+        os.system(runstr1z)
+
+    output_path = f"{output_dir}/2.bed"
+    if not Path(output_path).exists():
+        runstr1z2 = f"{plink} --bfile {plink_result_dir} --exclude {output_dir}/1kGP_0-merge.missnp --out {output_dir}/2 --make-bed --allow-extra-chr"
+        os.system(runstr1z2)
+
+    # Merge the pgp and 1kGP files again
+    output_path = f"{output_dir}/1kGP.bed"
+    if not Path(output_path).exists():
+        runstr3a = f"{plink} --bfile {output_dir}/2 --bmerge {output_dir}/1kGP_2 --out {output_dir}/1kGP --geno 0 --allow-extra-chr --make-bed"
+        os.system(runstr3a)
+
+
+def subset_unlinked_snps(plink_1kgp_path, output_dir, plink='software/plink'):
+    """
+    Choose a subset of filtered and independent (unlinked) SNPs
+
+    Input files:
+    - {output_dir}/{sample}_1kGP* (.bed, .bim, .fam, .log, .nosex)
+
+    Output files:
+    - {output_dir}/snps_to_prune (.log, .prune.in, .prune.out, .nosex)
+    - {output_dir}/{sample}_1kGP_pruned (.bed, .bim, .fam, .log, .nosex)
+
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # output_dir = f"{resultsdir}/temp"
+
+    # Choose a subset of filtered and independent (unlinked) SNPs
+
+    # Select a subgroup of unlinked SNPs, by pruning those with r2 > 0.1 using 100-SNP windows shifted at 5-SNP intervals.
+    output_path = f"{output_dir}/snps_to_prune.prune.in"
+    if not Path(output_path).exists():
+        runstr4 = f"{plink} --bfile {plink_1kgp_path} --out {output_dir}/snps_to_prune --indep-pairwise 100 5 0.1"
+        os.system(runstr4)
+
+    # Remove the pruned SNPs from the dataset and set the missingness threshold to 0.1
+    output_path = f"{output_dir}/1kGP_pruned.bed"
+    if not Path(output_path).exists():
+        runstr5 = f"{plink} --bfile {plink_1kgp_path} --out {output_dir}/1kGP_pruned --extract {output_dir}/snps_to_prune.prune.in --make-bed --mind 0.1"
+        os.system(runstr5)
+
+
+def from_1kgp_pruned_to_pca(plink_1kgp_pruned, output_dir, plink='software/plink'):
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    output_path = f"{output_dir}/1kGP_pruned_pca_20.eigenvec"
+    if not Path(output_path).exists():
+        runstr6 = f"{plink}"
+        runstr6 += f" --bfile {plink_1kgp_pruned}"
+        runstr6 += f" --out {output_dir}/1kGP_pruned_pca_20"
+        runstr6 += " --pca"
+        subprocess.run(runstr6, shell=True)
+
+
+def generate_ancestry_pipeline(bam_or_vcf, output_dir, numThreads=4, sample='sample',
+                               dir_reference='reference', dir_software='software', workdir="",
+                               ):
+    """
+    Generate the ancestry data from a BAM or gVCF file
+
+    Steps:
+    1. Preprocess the 1kGP data
+    2. Convert the BAM to a gVCF
+    3. Convert the gVCF to PLINK format
+    4. Merge the pgp and 1kGP files and get the SNPs that disagree between the datasets
+    5. Choose a subset of filtered and independent (unlinked) SNPs
+    6. Run PCA on the merged, pruned dataset
+
+
+    Parameters
+    ----------
+    bam_or_vcf
+    output_dir
+    numThreads
+    sample
+    dir_reference
+    dir_software
+    workdir
+
+    Returns
+    -------
+
+    """
+
+    dir = ""
+    if 'SINGULARITY_NAME' in os.environ:
+        dir = "/GenomeChronicler/"
+
+    # resultsdir = dir
+    plink = f"{dir}software/plink"
+    gatk = f"{dir}software/GenomeAnalysisTK.jar"
+    ref_hs38 = f"{dir}reference/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa"
+    initialBIM = f"{dir}reference/1kGP_GRCh38_exome.bim"
+    initialAnc = f"{dir}reference/1kGP_GRCh38_exome"
+    bcftools = "bcftools"
+    bgzip = "bgzip"
+    tabix = "tabix"
+    hasCHRflag = 0
+
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    # Also check for the need to samtools index this stuff
+    # java -jar software/picard.jar CreateSequenceDictionary R=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.fa O=software/GRCh38_full_analysis_set_plus_decoy_hla_noChr.dict
+
+    # ------------------------------------------
+    # -- I had already preprocessed the 1kGP data to reduce it to a PLINK file with
+    # -- only SNPs from the exome (914577 total)
+    # -- But from these, here I remove all the A/T or C/G SNPs to make sure that there
+    # -- will not be artifacts due to SNPs being reported on different strands in the two datasets
+    # -- I also remove the sex chromosomes
+    # ------------------------------------------
+    plink_1kgp_dir = output_dir/'1kGP_GRCh38_nonAT_CG'
+    get_list_of_snps_from_1kGP(initialBIM, plink_1kgp_dir, hasCHRflag=hasCHRflag, bgzip=bgzip, tabix=tabix)
+
+    # ------------------------------------------
+    # -- Convert the BAM to a gvcf that contains only
+    # -- the 1kGP SNPs filtered above
+    # ------------------------------------------
+    ref_1kGP = plink_1kgp_dir/'1kGP_GRCh38_nonAT_CG.bed'
+    vcf_dir = output_dir/'vcf_ancestry'
+    get_gvcf(bam_or_vcf, vcf_dir, ref_hs38, ref_1kGP, 'ancestry', gatk, numThreads, bcftools, bgzip)
+
+    # ------------------------------------------
+    # -- Convert GVCF data to PLINK format
+    # ------------------------------------------
+    gvcf_path = vcf_dir/f"ancestry.rsIDs.gvcf.gz"
+    gvcf_to_plink(gvcf_path, vcf_dir, sample, plink)
+
+    # ------------------------------------------
+    # -- Merge the gvcf with the 1kGP data. The files need to be merged twice: the first merge
+    # -- will fail if there are SNPs with different alleles in the two datasets.
+    # -- The second merge will exclude these SNPs
+    # ------------------------------------------
+    plink_result_dir = vcf_dir/sample
+    merge_pgp_1kgp_dir = output_dir/'merged_pgp_1kGP'
+    merge_pgp_1kGP(plink_result_dir, merge_pgp_1kgp_dir, initialAnc, plink)
+
+    # ------------------------------------------
+    # -- Reduce the merged dataset further by removing SNPs in LD
+    # ------------------------------------------
+    merge_pgp_1kgp_path = merge_pgp_1kgp_dir/f"1kGP"
+    subset_unlinked_snps_dir = output_dir/'subset_unlinked_snps'
+    subset_unlinked_snps(merge_pgp_1kgp_path, subset_unlinked_snps_dir, plink)
+
+    # ------------------------------------------
+    # -- Perform PCA
+    # ------------------------------------------
+    plink_result_dir = subset_unlinked_snps_dir/f"1kGP_pruned"
+    pca_dir = output_dir/'pca_ancestry'
+    from_1kgp_pruned_to_pca(plink_result_dir, pca_dir, plink)
+
+
+funcs = {
+    'get_list_of_snps_from_1kGP': get_list_of_snps_from_1kGP,
+    'get_gvcf': get_gvcf,
+    'gvcf_to_plink': gvcf_to_plink,
+    'merge_pgp_1kGP': merge_pgp_1kGP,
+    'subset_unlinked_snps': subset_unlinked_snps,
+    'from_1kgp_pruned_to_pca': from_1kgp_pruned_to_pca,
+    'generate_ancestry_pipeline': generate_ancestry_pipeline,
+}
+
+
+class Tools(object):
+    def __init__(self):
+        super(Tools, self).__init__()
+
+
+if __name__ == '__main__':
+    for k, v in funcs.items():
+        setattr(Tools, k, staticmethod(v))
+    fire.Fire(Tools)
diff --git a/scripts/GenomeChronicler_genoTables_fromAfoGeno.py b/scripts/genotables_from_afogeno.py
similarity index 80%
rename from scripts/GenomeChronicler_genoTables_fromAfoGeno.py
rename to scripts/genotables_from_afogeno.py
index c55efbd..2de75b5 100755
--- a/scripts/GenomeChronicler_genoTables_fromAfoGeno.py
+++ b/scripts/genotables_from_afogeno.py
@@ -8,11 +8,11 @@
 import pandas as pd
 from tqdm import tqdm
 
-def ucfirst(s):
-    if len(s) > 0:
-        return s[0].upper() + s[1:]
-    else:
-        return s
+if __name__ == '__main__':
+    import sys
+    sys.path.append(os.path.abspath(os.getcwd()))
+from scripts.utils import ucfirst
+
 
 def processGenoset(genoset, genosets, genotypes, positiveGenosets):
     # example: logic = ((( $v[0] + $v[1] + $v[2] + $v[3] + $v[4] ) >= 1) and (!$v[5] and !$v[6] and !$v[7] and !$v[8] and !$v[9]))
@@ -140,40 +140,62 @@ def generate_ClinVar_url(rsid, sth5, sth5_sql):
         return ""
 
 
-def geno_tables_from_afogeno(filename, outdir='./', verbose=False):
-    version = "22-146"
+def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference"):
+    """
+    This function generates the geno tables from the afogeno output.
+ + Inputs: + afogeno_path: path to the afogeno output + reference_dir: path to the reference directory + snpedia.db + gnomad.db + getevidence.db + clinvar.db + genosetDependencies.txt + parsedGenosets.txt + Outputs: + latest.good.reportTable.csv" + latest.bad.reportTable.csv + latest.genoset.reportTable.csv + + Parameters + ---------- + afogeno_path + output_dir + verbose + version + reference_dir + + Returns + ------- + + """ + # dir = "" + # if "SINGULARITY_NAME" in os.environ: + # dir = "/GenomeChronicler/" + + reference_dir = Path(reference_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + temp_dir = output_dir/"temp" + temp_dir.mkdir(parents=True, exist_ok=True) # Note: It is quite clear this code can be optimised tmpBlacklist = {"rs10156191": 1, "rs12094543": 1, "rs1667255": 1, "rs17672135": 1, "rs6445975": 1, "rs7659604": 1} print(f"This is script [{__file__}] part of the report generator pipeline version [{version}];\n") - Path(outdir).mkdir(parents=True, exist_ok=True) - # Sort out input - # if len(sys.argv) != 3: - # print("Usage: {0} MassagedGenotypeVCF OutputDirectory".format(sys.argv[0])) - # sys.exit(1) - - # filename = sys.argv[1] - # outdir = "./" - # if len(sys.argv) > 2: - # outdir = sys.argv[2] - - dir = "" - if "SINGULARITY_NAME" in os.environ: - dir = "/GenomeChronicler/" - # Connect to databases - snpedia_db = sqlite3.connect(f"{dir}reference/snpedia.db") + snpedia_db = sqlite3.connect(reference_dir/f"snpedia.db") print("Opened database successfully [SNPedia]") - gnomad_db = sqlite3.connect(f"{dir}reference/gnomad.db") + gnomad_db = sqlite3.connect(reference_dir/f"gnomad.db") print("Opened database successfully [GnomAD]") - getevidence_db = sqlite3.connect(f"{dir}reference/getevidence.db") + getevidence_db = sqlite3.connect(reference_dir/f"getevidence.db") print("Opened database successfully [GetEvidence]") - clinvar_db = sqlite3.connect(f"{dir}reference/clinvar.db") + clinvar_db = sqlite3.connect(reference_dir/f"clinvar.db") print("Opened database successfully [ClinVar]") # Prepare statements @@ -204,7 +226,7 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): # Input Genoset Data allGenosets = [] - with open(f"{dir}reference/genosetDependencies.txt", "r") as genoset_deps_file: + with open(reference_dir/f"genosetDependencies.txt", "r") as genoset_deps_file: for line in genoset_deps_file: line = line.strip() if not line: @@ -213,7 +235,7 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): genosets = {} genotypes = {} - with open(f"{dir}reference/parsedGenosets.txt", "r") as parsed_genosets_file: + with open(reference_dir/f"parsedGenosets.txt", "r") as parsed_genosets_file: for line in parsed_genosets_file: line = line.strip() if not line: @@ -225,15 +247,15 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): # output_path = f'{outdir}/latest.good.reportTable.csv' # if not Path(output_path).exists(): if True: - IN = open(filename) + IN = open(afogeno_path) rows_GOOD = [] rows_BAD = [] - Path(f"{outdir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - Path(f"{outdir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {outdir}/latest.good.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> 
{outdir}/latest.bad.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", + shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", + shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) for line in tqdm(IN, disable=not verbose): # for line in IN: @@ -388,10 +410,10 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): print("\t +++ POSITIVES:\n", positiveGenosets) - with open(f"{outdir}/latest.genoset.reportTable.csv", "w") as geno: + with open(f"{output_dir}/latest.genoset.reportTable.csv", "w") as geno: geno.write("Magnitude,Identifier,Summary\n") - sort_process = subprocess.Popen(f"sort -t \",\" -k 1,1nr >> {outdir}/latest.genoset.reportTable.csv", stdin=subprocess.PIPE, shell=True) + sort_process = subprocess.Popen(f"sort -t \",\" -k 1,1nr >> {output_dir}/latest.genoset.reportTable.csv", stdin=subprocess.PIPE, shell=True) # proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {outdir}/latest.bad.reportTable.csv", # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) for geno in sorted(positiveGenosets.keys()): @@ -409,6 +431,46 @@ def geno_tables_from_afogeno(filename, outdir='./', verbose=False): print("Operation done successfully") snpedia_db.close() + filter_final_report_csv(f"{temp_dir}/latest.good.reportTable.csv", f"{output_dir}/latest.good.reportTable.csv") + filter_final_report_csv(f"{temp_dir}/latest.bad.reportTable.csv", f"{output_dir}/latest.bad.reportTable.csv") + + +def filter_final_report_csv(csv_input_path, csv_output_path=None, DEBUG=False): + ''' + + ''' + # read csv + df = pd.read_csv(csv_input_path,dtype={'Mag.':'str'}) + + # filter + if len(df.columns) <= 4: + if DEBUG: + print("WARNING: No links found") + return + df = df[~(df['Mag.'].astype('float')==0)] + df = df[~(df['Identifier'].str.startswith('{rs')|df['Identifier'].str.startswith('{i'))] + + # export + if csv_output_path is None: # inplace + # warning, inplace operation + csv_output_path = csv_input_path + Path(csv_output_path).parent.mkdir(parents=True, exist_ok=True) + df.to_csv(csv_output_path, index=False) + + + +funcs = { + 'geno_tables_from_afogeno': geno_tables_from_afogeno, + 'filter_final_report_csv': filter_final_report_csv, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + if __name__ == '__main__': - fire.Fire(geno_tables_from_afogeno) + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/GenomeChronicler_XLSX_fromTables.py b/scripts/io.py similarity index 51% rename from scripts/GenomeChronicler_XLSX_fromTables.py rename to scripts/io.py index 40365bf..50b0428 100644 --- a/scripts/GenomeChronicler_XLSX_fromTables.py +++ b/scripts/io.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -# import sys -# import os import csv +import os +import shutil +import subprocess from pathlib import Path -# import pandas as pd import fire import xlsxwriter def export_genotypes_xlsx_from_csvs(csv_dir, xlsx_output_path, - csv_path_good='latest.good.reportTable.csv', + 
csv_path_good='latest.good.reportTable.csv', csv_path_bad='latest.bad.reportTable.csv', csv_path_genoset='latest.genoset.reportTable.csv', ): @@ -101,5 +101,105 @@ def export_genotypes_xlsx_from_csvs(csv_dir, xlsx_output_path, workbook.close() +def render_latex(tables_dir, sample_name, output_dir, + plot_path=None, vep_dir=None, + template_dir="templates", latex_template=None, + ): + """ + Render the latex template with the tables and the version table + + Template + - + + Latex render files: + Common: + latexTemplate.tex + versionTable.txt + GeneStructure.pdf + + Personal: + SampleName.txt + latest.good.reportTable.csv + latest.bad.reportTable.csv + latest.genoset.reportTable.csv + + with VEP: + latest.summary.csv + latest.polyphen_table.csv + latest.var_class_table.csv + latest.var_cons_table.csv + + Parameters + ---------- + tables_dir + sample_name + output_dir + + Returns + ------- + + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + template_dir = Path(template_dir) + + if latex_template is None: + if vep_dir is not None: + latex_template = template_dir/f"reportTemplate_withVEP.tex" + else: + latex_template = template_dir/f"reportTemplate_ohneVEP.tex" + + # Prepare files for latex rendering + ## Main file + latex_main_name = f"{sample_name}_report" + shutil.copy(latex_template, f"{output_dir}/{latex_main_name}.tex") + + ## Common files + shutil.copy(template_dir/f"versionTable.txt", output_dir) + shutil.copy(template_dir/f"GeneStructure.pdf", output_dir) + + ## Personal files + with open(f"{output_dir}/SampleName.txt", "w") as f: + f.write(sample_name) + tables_dir = Path(tables_dir) + shutil.copy(tables_dir/f"latest.good.reportTable.csv", output_dir) + shutil.copy(tables_dir/f"latest.bad.reportTable.csv", output_dir) + shutil.copy(tables_dir/f"latest.genoset.reportTable.csv", output_dir) + if plot_path is not None: + shutil.copy(plot_path, output_dir/'AncestryPlot.pdf') + + ## VEP files + if vep_dir is not None: + vep_dir = Path(vep_dir) + shutil.copy(vep_dir/f"latest.summary.csv", output_dir) + shutil.copy(vep_dir/f"latest.polyphen_table.csv", output_dir) + shutil.copy(vep_dir/f"latest.var_class_table.csv", output_dir) + shutil.copy(vep_dir/f"latest.var_cons_table.csv", output_dir) + + + # Render latex + cwd = os.getcwd() + os.chdir(f"{output_dir}") + for _ in range(3): + cmd = f"pdflatex -interaction=nonstopmode {latex_main_name}.tex 2> /dev/null > /dev/null" + ret = subprocess.run(cmd, shell=True) + os.chdir(cwd) + + +funcs = { + 'export_genotypes_xlsx_from_csvs': export_genotypes_xlsx_from_csvs, + 'render_latex': render_latex, +} + + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + if __name__ == '__main__': - fire.Fire(export_genotypes_xlsx_from_csvs) + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) + + diff --git a/scripts/GenomeChronicler_plot_generator_fromAncestry.R b/scripts/plot_generator_fromAncestry.R similarity index 97% rename from scripts/GenomeChronicler_plot_generator_fromAncestry.R rename to scripts/plot_generator_fromAncestry.R index b60ebae..1fcb719 100755 --- a/scripts/GenomeChronicler_plot_generator_fromAncestry.R +++ b/scripts/plot_generator_fromAncestry.R @@ -1,11 +1,11 @@ library(RColorBrewer) library(TeachingDemos) -sample <- as.character(Sys.getenv("SAMPLE")) +pca_path <- as.character(Sys.getenv("PCA_PATH")) id <- as.character(Sys.getenv("ID")) -dir <- as.character(Sys.getenv("DIR")) +output_dir <- as.character(Sys.getenv("OUTPUT_DIR")) -data <- 
read.table(paste0(dir,"/results/results_",sample,"/temp/",sample, "_1kGP_pruned_pca_20.eigenvec"), header=F) +data <- read.table(pca_path, header=F) qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] mypalette = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) @@ -156,7 +156,7 @@ for(i in 1:length(zoom_data[,1])){ } #pdf(paste0("./results/results_",sample,"/",sample, "_ancestry_pca.pdf")) -pdf(paste0(dir,"/results/results_",sample,"/AncestryPlot.pdf")) +pdf(paste0(output_dir,"/AncestryPlot.pdf")) plot(data[, 3] + 0.5 * data[, 4] - 0.5 * data[, 5], sin(pi / 3) * data[,4] + sin(pi / 3) * data[, 5], col = all.colors, pch = 19, cex = 0.3, xaxt='n', yaxt='n', xlab="", ylab="", main=paste0("Ancestry ",id)) points(data[pgp, 3] + 0.5 * data[pgp, 4] - 0.5 * data[pgp, 5], sin(pi / 3) * data[pgp,4] + sin(pi / 3) * data[pgp, 5], col = "black", pch = 16, cex = 1) @@ -181,5 +181,3 @@ cexs <- c(rep(0.5, nrow(zoom_data) - 2), 1.2, 1.2) subplot(plot(zoom_data[, 3] + 0.5 * zoom_data[, 4] - 0.5 * zoom_data[, 5], sin(pi / 3) * zoom_data[,4] + sin(pi / 3) * zoom_data[, 5], col = zoom.colors, pch = pchs, cex = cexs, xlab="", ylab="", xaxt='n', yaxt='n', xlim=c(bottom_x, top_x), ylim=c(bottom_y,top_y)), x = "topleft", size = c(1.7,1.7)) dev.off() - - diff --git a/scripts/GenomeChronicler_plot_generator_fromAncestry.py b/scripts/plot_generator_fromAncestry.py similarity index 99% rename from scripts/GenomeChronicler_plot_generator_fromAncestry.py rename to scripts/plot_generator_fromAncestry.py index 6309273..2f254eb 100755 --- a/scripts/GenomeChronicler_plot_generator_fromAncestry.py +++ b/scripts/plot_generator_fromAncestry.py @@ -1,11 +1,9 @@ -import os -import re from pathlib import Path import fire +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt import seaborn as sns from matplotlib import patches diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..3176ca8 --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,65 @@ +import subprocess +from pathlib import Path + +import fire + + +def clean_bam_file_noCHR(bam_path, output_dir=None): + """ + Reheader the BAM file and create an index + - Remove the "chr" prefix from the chromosome names + + Input: .bam + Output: .clean.bam, .clean.bam.bai + + Parameters + ---------- + bam_path: str or Path + BAM file path + output_dir: str, Path or None + - If None, the output file will be saved in the same directory as the input file + - Else, the output file will be saved in the specified directory + + Returns + ------- + None + + """ + # Reheader the BAM file and create an index + bam_path = Path(bam_path) + if output_dir is None: + output_dir = bam_path.parent + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir/ f"{bam_path.stem}.clean.BAM" + subprocess.run( + f"samtools reheader -c 'perl -pe \"s/^(@SQ.*)(\tSN:)chr/\$1\$2/\"' {bam_path} > {output_path}", + shell=True) + + print(f"\t +++ INFO: Indexing the BAM file") + subprocess.run(f"samtools index -@ 6 {output_path}", shell=True) + + + +funcs = { + 'clean_bam_file_noCHR': clean_bam_file_noCHR, +} + + +class UtilsTools(object): + def __init__(self): + super(UtilsTools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(UtilsTools, k, staticmethod(v)) + fire.Fire(UtilsTools) + + +def ucfirst(s): + if len(s) > 0: + return s[0].upper() + s[1:] + else: + return s diff --git 
a/scripts/GenomeChronicler_vepTables_fromVEP.py b/scripts/vep_tables_from_vep.py similarity index 80% rename from scripts/GenomeChronicler_vepTables_fromVEP.py rename to scripts/vep_tables_from_vep.py index e3e11fa..d1ef4f2 100755 --- a/scripts/GenomeChronicler_vepTables_fromVEP.py +++ b/scripts/vep_tables_from_vep.py @@ -1,39 +1,29 @@ #!/usr/bin/env python3 - -import subprocess -import sys +import os import re +import subprocess from pathlib import Path import fire -def ucfirst(s): - if len(s) > 0: - return s[0].upper() + s[1:] - else: - return s - -def get_vepTables_from_VEP(filename, out_dir="."): +if __name__ == '__main__': + import sys + sys.path.append(os.path.abspath(os.getcwd())) - # # Sort out input - # if len(sys.argv) != 3: - # sys.exit("Usage: {} VEPhtml OutputDir".format(sys.argv[0])) +from scripts.utils import ucfirst - # filename = sys.argv[1] - # out_dir = sys.argv[2] - # if not os.path.exists(filename): - # sys.exit("Major error here (File not found)") +def get_vep_tables_from_vep(vep_path, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) - out_file = open("{}/latest.summary.csv".format(out_dir), "w") + out_file = open(f"{output_dir}/latest.summary.csv", "w") out_file.write("Feature\tCount\n") recording = 0 - # with (filename, "r") as in_file: - with subprocess.Popen(f'cat {filename} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc: + with subprocess.Popen(f'cat {vep_path} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc: for line in proc.stdout: line = line.decode() - # line = line.strip() line = line.rstrip("\n") if not line: continue @@ -70,7 +60,7 @@ def get_vepTables_from_VEP(filename, out_dir="."): # in_file.close() out_file.close() - with open(filename, "r") as in_file: + with open(vep_path, "r") as in_file: for line in in_file: line = line.strip() if not line or "drawTable" not in line or "[" not in line: @@ -97,7 +87,7 @@ def get_vepTables_from_VEP(filename, out_dir="."): counter += 1 - out_file = open("{}/latest.{}.csv".format(out_dir, tab_name), "w") + out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w") head = table.pop(0) head[0] = "Label" @@ -121,4 +111,4 @@ def get_vepTables_from_VEP(filename, out_dir="."): if __name__ == '__main__': - fire.Fire(get_vepTables_from_VEP) + fire.Fire(get_vep_tables_from_vep) From bc35b56a04ba4b4ecef6f7943606b3845dcd088a Mon Sep 17 00:00:00 2001 From: liuxh Date: Mon, 28 Aug 2023 20:00:04 +0100 Subject: [PATCH 2/2] front jsons dump --- main.py | 82 +++---- scripts/genotables_from_afogeno.py | 312 +++++++++++++------------ scripts/plot_generator_fromAncestry.py | 1 + scripts/to_front_jsons.py | 197 ++++++++++++++++ scripts/utils.py | 109 ++++++++- scripts/vep_tables_from_vep.py | 114 --------- 6 files changed, 501 insertions(+), 314 deletions(-) create mode 100755 scripts/to_front_jsons.py delete mode 100755 scripts/vep_tables_from_vep.py diff --git a/main.py b/main.py index 7098656..15e44ba 100644 --- a/main.py +++ b/main.py @@ -12,8 +12,9 @@ from scripts.genotables_from_afogeno import geno_tables_from_afogeno from scripts.io import render_latex, export_genotypes_xlsx_from_csvs from scripts.plot_generator_fromAncestry import plot_generator_from_ancestry -from scripts.utils import clean_bam_file_noCHR -from scripts.vep_tables_from_vep import get_vep_tables_from_vep +from scripts.to_front_jsons import vep_labex_tables_to_jsons, geno_tables_to_jsons +from scripts.utils import 
clean_bam_file_noCHR, get_vep_tables_from_vep + def check_path_exist(path, file_type, error_template="--- ERROR: The {file_type} specified in the command line wasn't found [{" @@ -94,14 +95,11 @@ def main(): if "SINGULARITY_NAME" in os.environ: dir = "/GenomeChronicler/" - resultsdir = os.getcwd() - # Defining input options and their default values... parser = get_parser() args = parser.parse_args() # conf_file = None - debugFlag = args.debug BAM_file = args.BAM_file gVCF_file = args.gVCF_file VEP_file = args.VEP_file @@ -142,15 +140,8 @@ def main(): sample = None if BAM_file: sample = Path(BAM_file).name.split('.')[0] - # sample = os.path.splitext(os.path.basename(BAM_file))[0] - # sample = re.sub(r'\.recal', '', sample) - # sample = re.sub(r'\.bam\.clean', '', sample, flags=re.IGNORECASE) elif gVCF_file: sample = Path(gVCF_file).name.split('.')[0] - # sample = os.path.splitext(os.path.basename(gVCF_file))[0] - # sample = re.sub(r'\.g\.', '.', sample, flags=re.IGNORECASE) - # sample = re.sub(r'\.g\.vcf', '', sample, flags=re.IGNORECASE) - # sample = re.sub(r'\.vcf', '', sample, flags=re.IGNORECASE) else: print_header_ascii() print("\t --- MAJOR ERROR: No BAM or gVCF file found. Please check the usage notes above and try again ---\n", @@ -158,14 +149,6 @@ def main(): sys.exit(555) output_dir = Path(resultsdir) - temp_dir = output_dir/"temp" - temp_dir.mkdir(parents=True, exist_ok=True) - - # os.system(f"mkdir -p {temp_dir}") - # check_path_exist(temp_dir, "Results directory", exit_code=102, - # error_template="\t --- ERROR: Results directory [{path}] can't be found and could not be " - # "created. Please check permissions and try again ---\n") - LOGFILE2 = f"{output_dir}/{sample}.processingLog.stderr.txt" # Main bit of code @@ -181,12 +164,15 @@ def main(): output_vep_dir = output_dir/"vep_dir" get_vep_tables_from_vep(VEP_file, output_vep_dir) + clean_bam_dir = None if BAM_file: print("\t +++ INFO: Preprocessing BAM file") - output_path = temp_dir/f"{sample}.clean.BAM" + clean_bam_dir = output_dir/"clean_bam" + clean_bam_dir.mkdir(parents=True, exist_ok=True) + output_path = clean_bam_dir/f"{sample}.clean.BAM" if not output_path.exists(): - clean_bam_file_noCHR(BAM_file, temp_dir) + clean_bam_file_noCHR(BAM_file, clean_bam_dir) input_path = output_path elif gVCF_file: @@ -210,9 +196,7 @@ def main(): if not Path(output_plot_path).exists(): # cmd = f"PCA_PATH={pca_eigenvec_path} ID={sample} OUTPUT_DIR={output_plot_path} R CMD BATCH {dir}scripts/plot_generator_fromAncestry.R" # ret = subprocess.run(cmd, shell=True) - # print(ret) plot_generator_from_ancestry(pca_eigenvec_path, sample, output_dir) - # exit() print("\t +++ INFO: Generating Genotypes Files") output_afogeno_dir = output_dir/"afogeno" @@ -225,8 +209,10 @@ def main(): print("\t +++ INFO: Generating Genome Report Tables") output_tables_dir = output_dir/"tables" - geno_tables_from_afogeno(afogeno_file, output_tables_dir) - for path in output_tables_dir.glob("*.csv"): + geno_tables_from_afogeno(afogeno_file, output_tables_dir, truncate=True) + output_tables_full_dir = output_dir/"tables_full" + geno_tables_from_afogeno(afogeno_file, output_tables_full_dir, truncate=False) + for path in output_tables_full_dir.glob("*.csv"): shutil.copy(path, output_dir) print("\t +++ INFO: Combining Excel Tables") @@ -249,31 +235,27 @@ def main(): latex_pdf_path = run_latex_dir/f"{sample}_report.pdf" shutil.copy(latex_pdf_path, output_dir) # copy to main results dir + # dump front jsons + output_front_json_dir = output_dir/"front_jsons" + 
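# geno_tables_to_jsons serialises the full (untruncated) report tables, and
+    # vep_labex_tables_to_jsons the VEP/LaTeX summaries, into the JSON files
+    # read by the front-end (see scripts/to_front_jsons.py)
+    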
geno_tables_to_jsons(output_tables_full_dir, output_dir=output_front_json_dir) + vep_labex_tables_to_jsons(output_vep_dir, latex_dir=run_latex_dir, output_dir=output_front_json_dir) + + if no_clean_temporary_files is False: + print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") + + if clean_bam_dir is not None: + shutil.rmtree(clean_bam_dir, ignore_errors=True) + + shutil.rmtree(output_ancestry_dir, ignore_errors=True) + shutil.rmtree(output_afogeno_dir, ignore_errors=True) + # shutil.rmtree(output_tables_dir, ignore_errors=True) + # shutil.rmtree(output_tables_full_dir, ignore_errors=True) + shutil.rmtree(run_latex_dir, ignore_errors=True) + + if output_vep_dir is not None: + shutil.rmtree(output_vep_dir, ignore_errors=True) - # if no_clean_temporary_files is False: - # print("\t +++ INFO: Cleaning up Temporary and Intermediate Files") - # if BAM_file is not None: - # os.remove(BAM_file) - # os.remove(f"{BAM_file}.bai") - # - # shutil.rmtree(f"{temp_dir}") - # for file in Path(f"{output_dir}/").glob("latest*.csv"): - # os.remove(file) - # - # subprocess.run(f"rm -rf {output_dir}/versionTable.txt", shell=True) - # subprocess.run(f"rm -rf {output_dir}/GeneStructure.pdf", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.out", shell=True) - # subprocess.run(f"rm -rf {output_dir}/texput.log", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.aux", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.log", shell=True) - # subprocess.run(f"rm -rf {output_dir}/{TEMPLATETEX}.tex", shell=True) - # subprocess.run(f"rm -rf {dir}GenomeChronicler_plot_generator_fromAncestry.Rout", shell=True) - - # # time.sleep(1) - # if BAM_file is not None: - # BAM_file = BAM_file.replace(".clean.BAM", "") - # print( - # f"\n\t +++ DONE: Finished GenomeChronicler for file [ {BAM_file} ] in {datetime.now() - start_time} seconds") + print(f"\t +++ DONE: Finished GenomeChronicler for file [ {input_path} ] in {datetime.now() - start_time} seconds") if __name__ == '__main__': diff --git a/scripts/genotables_from_afogeno.py b/scripts/genotables_from_afogeno.py index 2de75b5..a9c7ed7 100755 --- a/scripts/genotables_from_afogeno.py +++ b/scripts/genotables_from_afogeno.py @@ -140,7 +140,7 @@ def generate_ClinVar_url(rsid, sth5, sth5_sql): return "" -def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference"): +def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="22-146", reference_dir="reference", truncate=True): """ This function generates the geno tables from the afogeno output. 
@@ -246,159 +246,183 @@ def geno_tables_from_afogeno(afogeno_path, output_dir, verbose=False, version="2 # output_path = f'{outdir}/latest.good.reportTable.csv' # if not Path(output_path).exists(): - if True: - IN = open(afogeno_path) - rows_GOOD = [] - rows_BAD = [] - Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") - proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", - shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) - - for line in tqdm(IN, disable=not verbose): - # for line in IN: - line = line.strip() - if line == "": - continue - if line.startswith("#"): + + IN = open(afogeno_path) + # rows_GOOD = [] + # rows_BAD = [] + # Path(f"{temp_dir}/latest.good.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + # Path(f"{temp_dir}/latest.bad.reportTable.csv").write_text("Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar\n") + # proc_good = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.good.reportTable.csv", + # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + # proc_bad = subprocess.Popen(f"sort | uniq | sort -t \',\' -k1,1nr >> {temp_dir}/latest.bad.reportTable.csv", + # shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + entries_good = [] + entries_bad = [] + for line in tqdm(IN, disable=not verbose): + # for line in IN: + line = line.strip() + if line == "": + continue + if line.startswith("#"): + continue + + debugBuffer = "" + + data = line.split("\t") + if len(data) < 12: + data.append("") + + debugBuffer += "\n+++\n" + "\t".join(data) + "\n" + + strand = "plus" + snp = data[3] + + # rvS = sth1s.execute(snp) + rvS = sth1s.execute(sth1s_sql, (snp,)) + counter = 0 + for row in sth1s.fetchall(): + strand = row[1] + counter += 1 + if counter > 1: + raise Exception("Oh my Gawd!!! 
DNA has two strands (and it isn't a good thing in this case)") + + # rvFlag = sth1f.execute(snp) + rvS = sth1f.execute(sth1f_sql, (snp,)) + flagID = 0 + for row in sth1f.fetchall(): + flagID = 1 + if flagID == 1 and (data[8] == "0/0" or data[8] == "./."): + continue + + # strand + if strand == "minus": + data[9] = data[9].translate(str.maketrans("ACTGactg", "TGACtgac")) + data[10] = data[10].translate(str.maketrans("ACTGactg", "TGACtgac")) + + genotype = [data[9], data[10]] + extra = data[11] + + genotypes[snp] = {} + if extra != "": + genotypes[snp][extra] = 1 + genotypes[snp][genotype[0]] = 1 + genotypes[snp][genotype[1]] = 1 + genotypes[snp][";".join(genotype)] = 1 + + try: + rv = sth1c.execute(sth1c_sql, (snp,)) + except Exception as e: + print(data) + print(e) + continue + alleles = {} + countGenotype = 0 + for row in sth1c.fetchall(): + debugBuffer += "ID = " + str(row[0]) + "\n" + debugBuffer += "REPUTE = " + str(row[1]) + "\n" + debugBuffer += "MAGNITUDE = " + str(row[2]) + "\n" + debugBuffer += "ALLELE1 = " + str(row[3]) + "\n" + debugBuffer += "ALLELE2 = " + str(row[4]) + "\n" + debugBuffer += "SUMMARY = " + str(row[5]) + "\n" + debugBuffer += "RSID = " + str(row[6]) + "\n" + debugBuffer += "IID = " + str(row[7]) + "\n" + + row = list(row) + row.append(row[5]) + row[5] = cleanSummaryString(row[5]) + # if truncate: + # row[5] = cleanSummaryString(row[5]) + # if "," in row[5]: + # row[5] = f'"{row[5]}"' + + a1 = row[3] + a2 = row[4] + + # if row[2] is None or row[2] == 0: + if row[2] is None or row[2] == 0 or row[2] == "0": # Excluding magnitude 0 continue - debugBuffer = "" + alleles.setdefault(a1, {})[a2] = [row[1], row[0], row[2], f"({a1};{a2})", row[5], row[-1]] + alleles.setdefault(a2, {})[a1] = alleles[a1][a2] - data = line.split("\t") - if len(data) < 12: - data.append("") + countGenotype += 1 - debugBuffer += "\n+++\n" + "\t".join(data) + "\n" - strand = "plus" - snp = data[3] - # rvS = sth1s.execute(snp) - rvS = sth1s.execute(sth1s_sql, (snp,)) - counter = 0 - for row in sth1s.fetchall(): - strand = row[1] - counter += 1 - if counter > 1: - raise Exception("Oh my Gawd!!! 
DNA has two strands (and it isn't a good thing in this case)") + if countGenotype: + if genotype[0] in alleles and genotype[1] in alleles[genotype[0]]: - # rvFlag = sth1f.execute(snp) - rvS = sth1f.execute(sth1f_sql, (snp,)) - flagID = 0 - for row in sth1f.fetchall(): - flagID = 1 - if flagID == 1 and (data[8] == "0/0" or data[8] == "./."): - continue + geno_desc = alleles[genotype[0]][genotype[1]][4] + geno_desc_full = alleles[genotype[0]][genotype[1]][5] + #Implement some filters here + if re.findall(r"^\s*$|^Common|^None|^Normal|^Average|^Benign most likely|^Ancestral value|^Unaffected Genotype", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Benign variant|^L22- S142-|^Depends on|^Extensive metabolizer", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Typical BuChE|Complex; generally normal risk|common in clinvar|Major allele, normal risk|Slight if any|Likely to be benign|Most common genotype", geno_desc, re.IGNORECASE): + continue + if re.findall(r"^Most likely a benign polymorphism|Benign \(harmless\) variant|^1.3x risk$", geno_desc, re.IGNORECASE): + continue + if re.findall(r"normal risk of migraine|More likely to go bald; common|Most likely benign polymorphism|Slightly increased lifespan?|^Benign polymorphism", geno_desc, re.IGNORECASE): + continue + if re.findall(r"No increased risk of |Likely to be a benign variant|Carrier of a benign change|Classified as benign in ClinVar", geno_desc, re.IGNORECASE): + continue - # strand - if strand == "minus": - data[9] = data[9].translate(str.maketrans("ACTGactg", "TGACtgac")) - data[10] = data[10].translate(str.maketrans("ACTGactg", "TGACtgac")) - - genotype = [data[9], data[10]] - extra = data[11] - - genotypes[snp] = {} - if extra != "": - genotypes[snp][extra] = 1 - genotypes[snp][genotype[0]] = 1 - genotypes[snp][genotype[1]] = 1 - genotypes[snp][";".join(genotype)] = 1 - - try: - rv = sth1c.execute(sth1c_sql, (snp,)) - except Exception as e: - print(data) - print(e) - continue - alleles = {} - countGenotype = 0 - for row in sth1c.fetchall(): - debugBuffer += "ID = " + str(row[0]) + "\n" - debugBuffer += "REPUTE = " + str(row[1]) + "\n" - debugBuffer += "MAGNITUDE = " + str(row[2]) + "\n" - debugBuffer += "ALLELE1 = " + str(row[3]) + "\n" - debugBuffer += "ALLELE2 = " + str(row[4]) + "\n" - debugBuffer += "SUMMARY = " + str(row[5]) + "\n" - debugBuffer += "RSID = " + str(row[6]) + "\n" - debugBuffer += "IID = " + str(row[7]) + "\n" - - row = list(row) - row[5] = cleanSummaryString(row[5]) - - a1 = row[3] - a2 = row[4] - - # if row[2] is None or row[2] == 0: - if row[2] is None or row[2] == 0 or row[2] == "0": # Excluding magnitude 0 + rsid = alleles[genotype[0]][genotype[1]][1] + gnomAD = generate_GnomAD_url(rsid, sth3, sth3_sql) + getevidence = generate_get_evidence_url(rsid, sth4, sth4_sql) + snpedia = generate_SNPedia_url(rsid) + clinvar = generate_ClinVar_url(rsid, sth5, sth5_sql) + + if gnomAD == "" and getevidence == "" and clinvar == "": continue - alleles.setdefault(a1, {})[a2] = [row[1], row[0], row[2], f"({a1};{a2})", row[5]] - alleles.setdefault(a2, {})[a1] = alleles[a1][a2] - - countGenotype += 1 - - - - if countGenotype: - if genotype[0] in alleles and genotype[1] in alleles[genotype[0]]: - - geno_desc = alleles[genotype[0]][genotype[1]][4] - #Implement some filters here - if re.findall(r"^\s*$|^Common|^None|^Normal|^Average|^Benign most likely|^Ancestral value|^Unaffected Genotype", geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Benign variant|^L22- S142-|^Depends on|^Extensive metabolizer", 
geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Typical BuChE|Complex; generally normal risk|common in clinvar|Major allele, normal risk|Slight if any|Likely to be benign|Most common genotype", geno_desc, re.IGNORECASE): - continue - if re.findall(r"^Most likely a benign polymorphism|Benign \(harmless\) variant|^1.3x risk$", geno_desc, re.IGNORECASE): - continue - if re.findall(r"normal risk of migraine|More likely to go bald; common|Most likely benign polymorphism|Slightly increased lifespan?|^Benign polymorphism", geno_desc, re.IGNORECASE): - continue - if re.findall(r"No increased risk of |Likely to be a benign variant|Carrier of a benign change|Classified as benign in ClinVar", geno_desc, re.IGNORECASE): - continue - - rsid = alleles[genotype[0]][genotype[1]][1] - gnomAD = generate_GnomAD_url(rsid, sth3, sth3_sql) - getevidence = generate_get_evidence_url(rsid, sth4, sth4_sql) - snpedia = generate_SNPedia_url(rsid) - clinvar = generate_ClinVar_url(rsid, sth5, sth5_sql) - - if gnomAD == "" and getevidence == "" and clinvar == "": - continue - - if rsid in tmpBlacklist: - continue - - alleles[genotype[0]][genotype[1]][1] = snpedia - alleles[genotype[0]][genotype[1]][4] = generate_Summary_url(rsid, - alleles[genotype[0]][genotype[1]][ - 4]) - tmpString = ",".join( - [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], - alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]]) - tmpString = tmpString.encode('ascii', 'ignore').decode() # Removing non-ASCII characters - if alleles[genotype[0]][genotype[1]][2] is None or alleles[genotype[0]][genotype[1]][2] == "": - continue - elif alleles[genotype[0]][genotype[1]][0] is None or alleles[genotype[0]][genotype[1]][0] == "": - # print("WARNING: Go here and with a fair sense of justice determine is this is good or bad\n", tmpString, "\n") - continue - elif alleles[genotype[0]][genotype[1]][0] == "Good": - proc_good.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) - elif alleles[genotype[0]][genotype[1]][0] == "Bad": - proc_bad.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) - - elif countGenotype > 2: - print("IMPORTANT: ", debugBuffer) - print("IMPORTANT: ", debugBuffer) - print("IMPORTANT: Please debug this here as I couldn't find the right alleles [", strand, "] [", - countGenotype, "]\n\n\n") - - proc_good.stdin.close() - proc_bad.stdin.close() + if rsid in tmpBlacklist: + continue + + desc = geno_desc_full + if truncate: + desc = geno_desc + alleles[genotype[0]][genotype[1]][1] = snpedia + alleles[genotype[0]][genotype[1]][4] = generate_Summary_url(rsid, desc) + entry = [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], + alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]] + + # tmpString = ",".join( + # [alleles[genotype[0]][genotype[1]][2], alleles[genotype[0]][genotype[1]][1], + # alleles[genotype[0]][genotype[1]][3], alleles[genotype[0]][genotype[1]][4]]) + # tmpString = tmpString.encode('ascii', 'ignore').decode() # Removing non-ASCII characters + if alleles[genotype[0]][genotype[1]][2] is None or alleles[genotype[0]][genotype[1]][2] == "": + continue + elif alleles[genotype[0]][genotype[1]][0] is None or alleles[genotype[0]][genotype[1]][0] == "": + # print("WARNING: Go here and with a fair sense of justice determine is this is good or bad\n", tmpString, "\n") + continue + elif alleles[genotype[0]][genotype[1]][0] == "Good": + # 
proc_good.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) + entry += [gnomAD, getevidence, clinvar] + entries_good.append(entry) + elif alleles[genotype[0]][genotype[1]][0] == "Bad": + entry += [gnomAD, getevidence, clinvar] + entries_bad.append(entry) + # proc_bad.stdin.write(f"{tmpString},{gnomAD},{getevidence},{clinvar}\n".encode()) + + elif countGenotype > 2: + print("IMPORTANT: ", debugBuffer) + print("IMPORTANT: ", debugBuffer) + print("IMPORTANT: Please debug this here as I couldn't find the right alleles [", strand, "] [", + countGenotype, "]\n\n\n") + + # proc_good.stdin.close() + # proc_bad.stdin.close() + cols = "Mag.,Identifier,Genotype,Summary,GnomAD,GetEvidence,ClinVar".split(",") + df_good = pd.DataFrame(entries_good, columns=cols) + df_good = df_good.sort_values(by=["Mag."], ascending=False) + df_good = df_good.drop_duplicates() + df_good.to_csv(f"{temp_dir}/latest.good.reportTable.csv", index=False) + df_bad = pd.DataFrame(entries_bad, columns=cols) + df_bad = df_bad.sort_values(by=["Mag."], ascending=False) + df_bad = df_bad.drop_duplicates() + df_bad.to_csv(f"{temp_dir}/latest.bad.reportTable.csv", index=False) positiveGenosets = {} diff --git a/scripts/plot_generator_fromAncestry.py b/scripts/plot_generator_fromAncestry.py index 2f254eb..7a206eb 100755 --- a/scripts/plot_generator_fromAncestry.py +++ b/scripts/plot_generator_fromAncestry.py @@ -120,6 +120,7 @@ def plot_generator_from_ancestry(eigenvec_path, sample_id, output_dir): ax_zoom.set_yticks([]) fig.savefig(f"{output_dir}/AncestryPlot.pdf", format='pdf', dpi=150, bbox_inches='tight') + fig.savefig(f"{output_dir}/AncestryPlot.png", format='png', dpi=150, bbox_inches='tight') if __name__ == '__main__': fire.Fire(plot_generator_from_ancestry) \ No newline at end of file diff --git a/scripts/to_front_jsons.py b/scripts/to_front_jsons.py new file mode 100755 index 0000000..5d21884 --- /dev/null +++ b/scripts/to_front_jsons.py @@ -0,0 +1,197 @@ +import json +import os +import subprocess +from pathlib import Path +import re + +import sqlite3 +import fire +import pandas as pd +from tqdm import tqdm + + +def link_warper_split(x): + tokens = x.split('}{') + link = tokens[0].split('\\href{')[-1] + text = tokens[1].strip('{}') + return text, link + +def row2entry(row): + data = {} + if 'Magnitude' in row: + magnitude = row['Magnitude'] + elif 'Mag.' in row: + magnitude = row['Mag.'] + else: + magnitude = None + data['magnitude'] = magnitude + + if 'Genotype' in row: + data['genotype'] = row['Genotype'].strip('()').split(';') + + text, _ = link_warper_split(row['Summary']) + desc_full = text + desc_short = text + if len(text) > 50: + desc_short = text[:47] + '...' 
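+    # keep the full summary text plus a ~50-character preview; presumably the
+    # front-end shows the short form in table cells and the full form on expansion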
+ data['description'] = { + 'full': desc_full, + 'short': desc_short, + } + id_name, id_link = link_warper_split(row['Identifier']) + data['identifier'] = { + 'link': id_link, + 'name': id_name, + } + db_links = [] + db_links += [{'link': id_link, 'name': 'SNPedia'}] + for db_name in 'GnomAD GetEvidence ClinVar'.split(): + if db_name not in row: + continue + try: + _, link = link_warper_split(row[db_name]) + db_links += [{'link': link, 'name': db_name}] + except: + db_links += [{'link': '', 'name': db_name}] + data['db_links'] = db_links + return data + + + +def geno_tables_to_jsons(table_full_dir, output_dir): + table_full_dir = Path(table_full_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + tables = { + 'beneficial_data.json': 'latest.good.reportTable.csv', + 'harmful_data.json': 'latest.bad.reportTable.csv', + 'genosets_data.json': 'latest.genoset.reportTable.csv', + } + for json_name, table_name in tables.items(): + path_table = table_full_dir / table_name + df = pd.read_csv(path_table) + data = df.apply(row2entry,axis=1).tolist() + # Path() + # data = table_to_json_data(df) + path_json = output_dir / json_name + path_json.write_text(json.dumps(data, indent=2)) + # pd.DataFrame(data).to_json(path_json, orient='records', indent=2) + + +json_templates = { + 'consequence_type_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#6699cc", "#fff275", "#ff8c42", "#ff3c38", "#a23e48", "#66635b", "#4e5166"], + "borderColor": ["#6699cc", "#fff275", "#ff8c42", "#ff3c38", "#a23e48", "#66635b", "#4e5166"], + "borderWidth": 1 + }] + }, + 'polyphen_summary_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#ff7700","#004777","#a30000","#efd28d"], + "borderColor": ["#ff7700","#004777","#a30000","#efd28d"], + "borderWidth": 1 + }] + }, + 'variant_class_data.json': { + "labels": [], + "datasets": [{ + "label": "# of Votes", + "data": [], + "backgroundColor": ["#ee6352","#59cd90","#3fa7d6","#fac05e","#f79d84"], + "borderColor": ["#ee6352","#59cd90","#3fa7d6","#fac05e","#f79d84"], + "borderWidth": 1 + }] + } +} + +labels_order = { + 'consequence_type_data.json': [ # order? 
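+        # fixed display order for the consequence-type pie chart; any label missing
+        # from the VEP table is skipped by vep_table_to_pie_json below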
+ 'Intron variant', + 'Non coding transcript variant', + 'Others', + 'Intergenic variant', + 'Regulatory region variant', + 'Downstream gene variant', + 'Upstream gene variant', + ], +} + +def vep_table_to_pie_json(df, json_name): + # Initialize JSON data structure + json_data = json_templates[json_name].copy() + if json_name in labels_order: + labels = labels_order[json_name] + else: + df = df.sort_values('Count', ascending=False) + labels = df['Label'].tolist() + + # Fill JSON data structure + json_data["labels"] = [] + data_table = df.set_index('Label')['Count'] + for label in labels: + if label not in data_table: + continue + count = data_table[label] + json_data["labels"].append(f"{label} ({count:.2f}%)") + json_data["datasets"][0]["data"].append(round(count, 2)) + return json_data + + +def vep_labex_tables_to_jsons(vep_dir, latex_dir, output_dir): + if vep_dir is not None: + vep_dir = Path(vep_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + tables = { + 'consequence_type_data.json': 'latest.consequences_table.csv', + 'polyphen_summary_data.json': 'latest.polyphen_table.csv', + 'variant_class_data.json': 'latest.var_class_table.csv', + } + for json_name, table_name in tables.items(): + path_table = vep_dir / table_name + df = pd.read_csv(path_table) + json_data = vep_table_to_pie_json(df, json_name) + path_json = output_dir / json_name + path_json.write_text(json.dumps(json_data, indent=2)) + + if latex_dir is not None: + latex_dir = Path(latex_dir) + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True, parents=True) + # report_metadata.json + output_path = output_dir / 'report_metadata.json' + df_version = pd.read_csv(latex_dir / 'versionTable.txt', sep='\t') + df_version['db_links'] = df_version.apply(lambda x: [{'name':x['Resource'],'link':link_warper_split(x['Website'])[1]}],axis=1) + df_version = df_version.rename(columns={'Resource':'resource','Version':'version'}) + df_version = df_version.drop(columns=['Website']) + df_version.to_json(output_path, orient='records', indent=2) + + # variant_calling_data.json + output_path = output_dir / 'variant_calling_data.json' + df_sum = pd.read_csv(latex_dir / 'latest.summary.csv',sep='\t') + df_sum = df_sum.reset_index().rename(columns={'index':'key'}) + df_sum['key'] = df_sum['key']+1 + df_sum = df_sum.rename(columns={'Feature':'feature','Count':'count'}) + df_sum.to_json(output_path, orient='records', indent=2) + +funcs = { + 'geno_tables_to_jsons': geno_tables_to_jsons, + 'vep_labex_tables_to_jsons': vep_labex_tables_to_jsons, +} + +class Tools(object): + def __init__(self): + super(Tools, self).__init__() + + +if __name__ == '__main__': + for k, v in funcs.items(): + setattr(Tools, k, staticmethod(v)) + fire.Fire(Tools) diff --git a/scripts/utils.py b/scripts/utils.py index 3176ca8..0de8e0c 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -1,8 +1,15 @@ +import re import subprocess from pathlib import Path import fire +def ucfirst(s): + if len(s) > 0: + return s[0].upper() + s[1:] + else: + return s + def clean_bam_file_noCHR(bam_path, output_dir=None): """ @@ -41,9 +48,105 @@ def clean_bam_file_noCHR(bam_path, output_dir=None): subprocess.run(f"samtools index -@ 6 {output_path}", shell=True) +def get_vep_tables_from_vep(vep_path, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + out_file = open(f"{output_dir}/latest.summary.csv", "w") + out_file.write("Feature\tCount\n") + + recording = 0 + with subprocess.Popen(f'cat {vep_path} | 
grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
+        for line in proc.stdout:
+            line = line.decode()
+            line = line.rstrip("\n")
+            if not line:
+                continue
+
+            # Note for future generations: This is as silly as Perl gets, but it does the job (for now). I will re-write this with proper parsing at some point, or even better, use an off-the-shelf HTML parsing module.
+            # Flatten the stats HTML enough to scrape it: row ends become newlines, cell
+            # ends become tabs, and the <table> markers are kept to track table boundaries.
+            # (The tag patterns below may need adjusting to the exact VEP markup.)
+            line = line.replace("</tr>", "\n")
+            line = re.sub(r"</table>", "</table>\n\n", line)
+            line = re.sub(r"</t[dh]>", "\t", line)
+            line = re.sub(r"<t[rdh][^>]*>", "", line)
+
+            lines = line.split("\n")
+            for l in lines:
+                if "stats_table" in l:
+                    recording += 1
+                    continue
+                elif ("</table>" in l.lower()) and (recording == 2):
+                    recording = 0
+
+                # l = l.strip()
+                l = re.sub(r"^\s+", "", l)
+                l = re.sub(r"\s+$", "", l)
+                l = re.sub(r"\s{2,}", "\t", l)
+
+                if ("variants processed" in l.lower()):
+                    continue
+                if ("lines of output written" in l.lower()):
+                    continue
+
+                if recording == 2:
+                    out_file.write("{}\n".format(l))
+
+    # in_file.close()
+    out_file.close()
+
+    with open(vep_path, "r") as in_file:
+        for line in in_file:
+            line = line.strip()
+            if not line or "drawTable" not in line or "[" not in line:
+                continue
+
+            match = re.search(r"drawTable\('(.+?)'.+?\[(.+)\]", line)
+            if match:
+                tab_name = match.group(1)
+                tab_string = match.group(2)
+
+                counter = 0
+                summer = 0
+                table = []
+                for pair in re.findall(r"\[(.+?)\]", tab_string):
+                    pair = pair.replace("'", "")
+                    pair = pair.replace("_", " ")
+                    if counter == 0:
+                        pair = pair.replace(" ", "")
+                    split = pair.split(",")
+                    table.append(split)
+
+                    if counter > 0:
+                        summer += int(split[1])
+
+                    counter += 1
+
+                out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w")
+
+                head = table.pop(0)
+                head[0] = "Label"
+                out_file.write("{}\n".format(",".join(head)))
+
+                other = 0
+                for d in table:
+                    perc = f"{round(float(d[1]) / summer * 100, 2):0.2f}"
+                    if float(perc) > 1:
+                        d[0] = ucfirst(d[0])
+                        d[1] = perc
+                        out_file.write("{}\n".format(",".join([str(x) for x in d])))
+                    else:
+                        other += int(d[1])
+
+                if other > 0:
+                    perc = f"{round(float(other) / summer * 100, 2):0.2f}"
+                    out_file.write(f"Others,{perc}\n")
+                    # out_file.write("{}\n".format(",".join([str(x) for x in d])))
+
 
 funcs = {
     'clean_bam_file_noCHR': clean_bam_file_noCHR,
+    'get_vep_tables_from_vep': get_vep_tables_from_vep,
 }
 
 
@@ -57,9 +160,3 @@ def __init__(self):
         setattr(UtilsTools, k, staticmethod(v))
     fire.Fire(UtilsTools)
 
-
-def ucfirst(s):
-    if len(s) > 0:
-        return s[0].upper() + s[1:]
-    else:
-        return s
diff --git a/scripts/vep_tables_from_vep.py b/scripts/vep_tables_from_vep.py
deleted file mode 100755
index d1ef4f2..0000000
--- a/scripts/vep_tables_from_vep.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/env python3
-import os
-import re
-import subprocess
-from pathlib import Path
-
-import fire
-
-if __name__ == '__main__':
-    import sys
-    sys.path.append(os.path.abspath(os.getcwd()))
-
-from scripts.utils import ucfirst
-
-
-def get_vep_tables_from_vep(vep_path, output_dir):
-    output_dir = Path(output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    out_file = open(f"{output_dir}/latest.summary.csv", "w")
-    out_file.write("Feature\tCount\n")
-
-    recording = 0
-    with subprocess.Popen(f'cat {vep_path} | grep "gen_stats"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
-        for line in proc.stdout:
-            line = line.decode()
-            line = line.rstrip("\n")
-            if not line:
-                continue
-
-            # Note for future generations: This is as silly as Perl gets, but it does the job (for now).
I will re-write this with proper parsing at some point, or even better, use an off-the-shelve HTML parsing module. - line = line.replace("", "\n") - line = re.sub(r"", "\n\n", line) - line = re.sub(r"", "\t", line) - line = re.sub(r"|", "", line) - - lines = line.split("\n") - for l in lines: - if "stats_table" in l: - recording += 1 - continue - elif ("" in l.lower()) and (recording == 2): - recording = 0 - - # l = l.strip() - l = re.sub(r"^\s+", "", l) - l = re.sub(r"\s+$", "", l) - l = re.sub(r"\s{2,}", "\t", l) - - if ("variants processed" in l.lower()): - continue - if ("lines of output written" in l.lower()): - continue - - - - if recording == 2: - out_file.write("{}\n".format(l)) - - # in_file.close() - out_file.close() - - with open(vep_path, "r") as in_file: - for line in in_file: - line = line.strip() - if not line or "drawTable" not in line or "[" not in line: - continue - - match = re.search(r"drawTable\('(.+?)'.+?\[(.+)\]", line) - if match: - tab_name = match.group(1) - tab_string = match.group(2) - - counter = 0 - summer = 0 - table = [] - for pair in re.findall(r"\[(.+?)\]", tab_string): - pair = pair.replace("'", "") - pair = pair.replace("_", " ") - if counter == 0: - pair = pair.replace(" ", "") - split = pair.split(",") - table.append(split) - - if counter > 0: - summer += int(split[1]) - - counter += 1 - - out_file = open("{}/latest.{}.csv".format(output_dir, tab_name), "w") - - head = table.pop(0) - head[0] = "Label" - out_file.write("{}\n".format(",".join(head))) - - other = 0 - for d in table: - perc = f"{round(float(d[1]) / summer * 100, 2):0.2f}" - if float(perc) > 1: - d[0] = ucfirst(d[0]) - d[1] = perc - out_file.write("{}\n".format(",".join([str(x) for x in d]))) - else: - other += int(d[1]) - - if other > 0: - perc = f"{round(float(other) / summer * 100, 2):0.2f}" - out_file.write(f"Others,{perc}\n") - # out_file.write("{}\n".format(",".join([str(x) for x in d]))) - - - -if __name__ == '__main__': - fire.Fire(get_vep_tables_from_vep)