SNPsplit_genome_preparation

#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Cwd;

## This program is Copyright (C) 2014-23, Felix Krueger (fkrueger@altoslabs.com)

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.


### This script filters the latest VCF file for various SNPs versus the GRCm39 mouse genome build and writes high confidence SNPs into a folder called 'SNPs_<strain>' (one file per chromosome);

### Update Dec 2022: The current version of the Mouse Genomes Project (https://www.mousegenomes.org/) is v8. The SNP file may be obtained here:
### https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-2112-v8-SNPs_Indels/: "mgp_REL2021_snps.vcf.gz"

### Older versions of the SNPs/genomes are no longer supported but might still work (e.g.: 'mgp.v5.merged.snps_all.dbSNP142.vcf.gz'). However, please note that older
### files also use the, now outdated, genome build GRCm38!

# Modifying 20 December 2022 - to accept the new release v8 and work with the latest Mouse Genome Project 
# https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-2112-v8-SNPs_Indels/  "mgp_REL2021_snps.vcf.gz"

## Global settings: pipeline version, working directory and command line options
my $pipeline_version = '0.6.0';
my $parent_dir       = getcwd();
my (
    $vcf_file,       $strain,        $strain2,     $strain_index,
    $strain2_index,  $genome_folder, $skip_filtering,
    $nmasking,       $full_sequence, $dual_hybrid,
    $genome_build,   $v7,
) = process_commandline();

my %snps;             # all filtered SNPs
my %snps_dual_genome; # all SNPs used for the dual hybrid genome
my %homozygous_SNPs;  # SNP genotypes and FILTER value for dual hybrids

# Names of the per-strain SNP annotation files. The second file is only
# meaningful when a second strain was specified; otherwise a placeholder is used.
my $snp_file_strain  = "all_SNPs_${strain}_${genome_build}.txt.gz";
my $snp_file_strain2 = $strain2
    ? "all_SNPs_${strain2}_${genome_build}.txt.gz"
    : 'irrelevant_for_single_hybrid_mode';

# Name of the file holding the new Ref/SNP annotations for dual hybrids
# (assigned in determine_SNPs_between_strain_and_strain2)
my $new_ref_snp_annoations;

# Print a summary of all run parameters to STDERR
warn "Summarising SNPsplit Genome Preparation Parameters\n";
warn '=' x 50, "\n";

# VCF handling: the VCF file name is only reported if it is actually going to be read
if ($skip_filtering) {
    warn "Reading/filtering VCF file:\t\tNo (skipped by user)\n";
}
else {
    warn "Processing SNPs from VCF file:\t\t$vcf_file\n";
    warn "Reading/filtering VCF file:\t\tYes (default)\n";
}

warn "Reference genome:\t\t\t$genome_folder\n";

# N-masking
warn $nmasking
    ? "N-masking:\t\t\t\tYes\n"
    : "N-masking:\t\t\t\tNo\n";

# Full SNP incorporation
warn $full_sequence
    ? "Full SNP genome:\t\t\tYes\n"
    : "Full SNP genome:\t\t\tNo\n";

# Strain(s)
warn "SNP strain:\t\t\t\t$strain\n";
warn "SNP strain 2:\t\t\t\t$strain2\n"                    if $strain2;
warn "Dual hybrid, new Ref/SNP:\t\t$strain/$strain2\n"    if $dual_hybrid;
warn "\n";

### Working out which chromosomes carry SNP information
my @chroms;
my %chroms; # lookup of chromosomes covered by SNPs (this is dictated by the VCF file)

if (not $skip_filtering) { # default: take the chromosome names from the VCF file
    @chroms = detect_chroms();
}
else {
    # Without reading the VCF file the chromosome names are hardcoded for the MOUSE GENOME
    # (for the HUMAN GENOME this would be: (1..22,'X','Y','MT') )
    @chroms = (1 .. 19, 'X', 'Y', 'MT');
}

# Flag every chromosome for which SNP information is available
@chroms{@chroms} = (1) x @chroms;

print $skip_filtering
    ? "Using the following chromosomes (HARDCODED IN!!!):\n"
    : "Using the following chromosomes (detected from VCF file >>$vcf_file<<):\n";
print join("\t", @chroms), "\n\n";

### Determining and filtering homozygous high-confidence SNPs for the first strain
if (not $skip_filtering) {
    # the last argument is the strain identity, here the first strain
    filter_relevant_SNP_calls_from_VCF($strain, $strain_index, '1');
    warn "Finished filtering and writing out SNPs for strain $strain\n\n";
}
else {
    warn "Skipped reading the VCF file and filtering SNPs again (specified by user)\n\n";
}


### Loading the reference genome; sequences are kept in %chromosomes
my %chromosomes; # genomic sequence
read_genome_into_memory($parent_dir);

### Counters for the modified genome (shared with the chromosome-modifying subroutines)
my $new_n_total    = 0;
my $new_snp_total  = 0;
my $already_total  = 0;
my $low_confidence = 0;

# The genome generation report for strain 1 is written via the REPORT filehandle
my $report = "${strain}_genome_preparation_report.txt";
open (REPORT,'>',$report) or die "Failed to write to file $report: $!\n";

for my $chr (sort keys %chromosomes) {

    # Chromosomes with SNP information get modified (N-masking, full sequence or both)
    if (exists $chroms{$chr}) {
        create_modified_chromosome($chr,$strain);
        next;
    }

    # No SNP information for this chromosome: the sequence is written out unmodified
    write_SNP_chromosome($chr,$chromosomes{$chr},1,$strain) if $nmasking;
    write_SNP_chromosome($chr,$chromosomes{$chr},0,$strain) if $full_sequence;
}

if ($nmasking) {
    my $n_summary = "$new_n_total Ns were newly introduced into the N-masked genome for strain $strain in total\n";
    warn "\n\nSummary\n", $n_summary;
    print REPORT "\nSummary\n", $n_summary;
}

if ($full_sequence) {
    my $snp_summary = "$new_snp_total SNPs were newly introduced into the full sequence genome version for strain $strain in total\n";
    warn $snp_summary, "\n";
    print REPORT $snp_summary;
}
warn "\n";
close REPORT;

### Create modified genome 2
### Dual hybrid mode runs three stages:
###   1) create the strain 2 genome(s) relative to the reference genome,
###   2) work out the SNPs that differ between strain 1 and strain 2,
###   3) re-read the strain 1 full sequence genome as the new reference and
###      introduce the strain 1/strain 2 SNPs (N-masking and/or full sequence).
if ($dual_hybrid){

    warn "Now starting to work on strain 2 [$strain2]\n";

    ### Determining and filtering homozygous high-confidence SNPs for strain 2
    if ($skip_filtering){
        warn "Skipped reading the VCF file and filtering SNPs again for strain 2 (specified by user)\n\n";
    }
    else{
        filter_relevant_SNP_calls_from_VCF($strain2,$strain2_index,'2'); # the last number is the strain identity, here the second strain for dual hybrids
        warn "Finished filtering and writing out SNPs for strain 2 [$strain2]\n\n";
    }

    # Resetting the counters that were used for strain 1
    $new_n_total = 0;
    $new_snp_total = 0;
    $already_total = 0;
    $low_confidence = 0;

    # Writing a genome generation report file for strain 2
    my $report = "${strain2}_genome_preparation_report.txt";

    open (REPORT,'>',$report) or die "Failed to write to file $report: $!\n";
    for my $chr (sort keys %chromosomes) {

        if (exists $chroms{$chr} ){
            # SNP information is available for this chromosome
            create_modified_chromosome($chr,$strain2);
        }
        else{
            # no SNP information: the sequence is written out unmodified
            if ($nmasking){
                write_SNP_chromosome($chr,$chromosomes{$chr},1,$strain2);
            }
            if ($full_sequence){
                write_SNP_chromosome($chr,$chromosomes{$chr},0,$strain2);
            }
        }
    }

    if ($nmasking){
        warn "\nSummary\n$new_n_total Ns were newly introduced into the N-masked genome for strain 2 [$strain2] in total\n";
        print REPORT "\nSummary\n$new_n_total Ns were newly introduced into the N-masked genome for strain 2 [$strain2] in total\n";
    }

    if ($full_sequence){
        warn "$new_snp_total SNPs were newly introduced into the full sequence genome version for strain 2 [$strain2] in total\n\n";
        print REPORT "$new_snp_total SNPs were newly introduced into the full sequence genome version for strain 2 [$strain2] in total\n";
    }
    close REPORT;

    ### Final dual genome construction report
    $report = "${strain}_${strain2}_dual_hybrid.genome_preparation_report.txt";
    open (REPORT,'>',$report) or die "Failed to write to file $report: $!\n";

    ### Working out the SNPs between strain 1 and strain 2 (this also sets $new_ref_snp_annoations)
    determine_SNPs_between_strain_and_strain2();
    warn "done...\n";

    ### Resetting the genomic reference sequence
    %chromosomes = (); # genomic sequence
    warn "Changing the genomic reference sequence to the full sequence of strain $strain\n\n";
    print REPORT "Changing the genomic reference sequence to the full sequence of strain $strain\n\n";

    # The strain 1 full sequence genome written earlier now serves as the reference genome
    $genome_folder = "${parent_dir}/${strain}_full_sequence/";
    read_genome_into_memory("${parent_dir}");

    warn "Reading and storing all new SNPs with Ref/SNP: $strain/$strain2 from '$new_ref_snp_annoations'\n";
    print REPORT "Reading and storing all new SNPs with Ref/SNP: $strain/$strain2 from '$new_ref_snp_annoations'\n";
    read_new_snp_annotation($new_ref_snp_annoations,$strain,$strain2);

    $new_n_total = 0;
    $new_snp_total = 0;
    $already_total = 0;
    $low_confidence = 0;

    # Looping over all chromosomes of the (new) reference genome; %chroms tells us
    # for which of them SNP information is available
    for my $chr (sort keys %chromosomes) {

        if (exists $chroms{$chr} ){
            warn "Got SNP information for chromosome $chr. Creating modified chromosome\n";
            create_modified_chromosome_dual_hybrid($chr,$strain,$strain2);
        }
        else{
            warn "Got no SNP information for chromosome $chr. Printing sequence only...\n";
            if ($nmasking){
                write_SNP_chromosome($chr,$chromosomes{$chr},1,"${strain}_${strain2}_dual_hybrid.based_on_${genome_build}");
            }
            if ($full_sequence){
                write_SNP_chromosome($chr,$chromosomes{$chr},0,"${strain}_${strain2}_dual_hybrid.based_on_${genome_build}");
            }
        }
    }

    # Summary messages (terminal and report now carry the same, correctly formatted text)
    if ($nmasking){
        warn "\nSummary\n$new_n_total Ns were newly introduced into the N-masked genome for strain/strain 2 [${strain}/$strain2] in total\n";
        print REPORT "\nSummary\n$new_n_total Ns were newly introduced into the N-masked genome for strain/strain 2 [${strain}/$strain2] in total\n";
    }

    if ($full_sequence){
        warn "$new_snp_total SNPs were newly introduced into the full sequence genome version for strain/strain 2 [${strain}/$strain2] in total\n\n";
        print REPORT "$new_snp_total SNPs were newly introduced into the full sequence genome version for strain/strain 2 [${strain}/$strain2] in total\n";
    }
    close REPORT;

}
warn "All done. Genome(s) are now ready to be indexed with your favourite aligner!\nFYI, aligners shown to work with SNPsplit are Bowtie2, STAR, HISAT2, HiCUP and Bismark (STAR and Hisat2 require disabling soft-clipping, please check the SNPsplit manual for details)\n\n";


#############################################################
### SUBROUTINES
#############################################################

# Reads the strain/strain2 SNP annotation file and stores every usable SNP in the
# file-level hash %snps_dual_genome as $snps_dual_genome{chr}->{pos}->{ref|snp}.
#
# Arguments:
#   $file    - tab-delimited annotation file (ID, chromosome, position, strand, allele);
#              a file name ending in 'gz' is read through 'gunzip -c'
#   $strain  - name of the strain acting as the new reference (used for messages only)
#   $strain2 - name of the strain acting as the SNP genome (used for messages only)
#
# The allele column has to look like 'A/C' (new reference base/SNP base); lines that
# do not match are reported and skipped. Dies if the file is missing, can't be opened
# or can't be closed cleanly.
sub read_new_snp_annotation {

    my ($file,$strain,$strain2) = @_;
    warn "Reading $strain/$strain2 SNPs from file '$file'\n"; sleep(1);

    unless (-e $file) {
        die "Couldn't find SNP file '$file'\n";
    }

    # List-form pipe open: the file name is handed to gunzip directly and never
    # passes through a shell, so spaces or shell metacharacters are harmless
    my $in;
    if ($file =~ /gz$/){
        open ($in,'-|','gunzip','-c',$file) or die "Failed to read from gzipped file '$file': $!\n";
    }
    else{
        open ($in,'<',$file) or die "Failed to read from file '$file': $!\n";
    }
    my $count = 0;

    while (my $line = <$in>) {
        chomp $line;
        $count++;
        if ($count %1000000 == 0){
            warn "Processed $count lines so far\n";
        }

        my (undef,$chr,$pos,$strand,$allele) = split(/\t/,$line);

        # Lines without an allele column can't be used; skipping them here avoids
        # matching against an undefined value further down
        unless (defined $allele and length $allele){
            warn "Skipping line without allele information: '$line'\n";
            next;
        }

        # ref here is the $strain sequence, SNP is the $strain2 sequence
        my ($ref,$snp);

        if ($allele =~ /^([GATC])\/([GATC])$/) {
            $ref = $1;
            $snp = $2;
        }
        else {
            warn "Skipping allele '$allele'\n";
            next;
        }

        # a strand of -1 means the alleles are given for the reverse strand and need complementing
        if ($strand == -1) {
            $ref =~ tr/GATC/CTAG/;
            $snp =~ tr/GATC/CTAG/;
        }

        $snps_dual_genome{$chr} -> {$pos}-> {ref} = $ref;
        $snps_dual_genome{$chr} -> {$pos}-> {snp} = $snp;
    }

    close $in or die "Failed to close filehandle for file '$file': $!\n";

}

# Opens the four output files needed for the strain/strain2 comparison and then hands
# over to read_snp_files(), which writes to (and closes) these filehandles:
#   OUT_STRAIN          - SNPs specific for strain 1
#   OUT_STRAIN2         - SNPs specific for strain 2
#   OUT_COMMON          - SNPs shared by both strains
#   ALL_STRAIN_STRAIN2  - all new SNPs with strain 1 as reference and strain 2 as SNP
# The name of the last file is also stored in $new_ref_snp_annoations.
# Works on the file-level variables $strain, $strain2 and $genome_build; takes no arguments.
sub determine_SNPs_between_strain_and_strain2{

    warn "Determining new Ref [$strain] and SNP [$strain2] annotations\n";
    warn "============================================================\n\n";
    my $out_strain = "${strain}_specific_SNPs.${genome_build}.txt";
    open (OUT_STRAIN,'>',$out_strain) or die "Failed to write to file $out_strain: $!\n";
    warn "Writing $strain specific SNPs (relative to the $genome_build reference) to >>$out_strain<<\n";

    my $out_strain2 = "${strain2}_specific_SNPs.${genome_build}.txt";
    open (OUT_STRAIN2,'>',$out_strain2) or die "Failed to write to file $out_strain2: $!\n";
    warn "Writing $strain2 specific SNPs (relative to the $genome_build reference) to >>$out_strain2<<\n";

    my $out_common = "${strain}_${strain2}_SNPs_in_common.${genome_build}.txt";
    open (OUT_COMMON,'>',$out_common) or die "Failed to write to file $out_common: $!\n";
    warn "Writing SNPs in common between $strain and $strain2 (relative to the $genome_build reference) to >>$out_common<<\n";

    my $all_strain_strain2 = "all_${strain2}_SNPs_${strain}_reference.based_on_${genome_build}.txt";
    open (ALL_STRAIN_STRAIN2,'>',$all_strain_strain2) or die "Failed to write to file $all_strain_strain2: $!\n";
    warn "Writing all new SNPs >>$strain/$strain2 to >>$all_strain_strain2<<\n\n";
    $new_ref_snp_annoations = $all_strain_strain2; # required for N-masking etc.

    read_snp_files();

}

# Compares the high confidence SNPs of strain 1 ($snp_file_strain) with those of
# strain 2 ($snp_file_strain2) and writes the results to the filehandles OUT_STRAIN,
# OUT_STRAIN2, OUT_COMMON and ALL_STRAIN_STRAIN2 (these have to be open already) as
# well as summary statistics to STDERR and REPORT.
#
#   - SNPs with the same base in both strains       -> OUT_COMMON
#   - SNPs at the same position but different bases -> ALL_STRAIN_STRAIN2 (as strain1/strain2)
#   - SNPs only found for strain 2                  -> OUT_STRAIN2 and ALL_STRAIN_STRAIN2
#   - SNPs only found for strain 1                  -> OUT_STRAIN and ALL_STRAIN_STRAIN2
#                                                      (as strain 1 base/reference base)
# Positions that were recorded in %homozygous_SNPs as failing the filter in the other
# strain are skipped and counted as confidence discrepancies.
#
# Takes no arguments (works on file-level variables). Dies if a SNP file is missing
# or can't be opened, or if an output file can't be closed.
sub read_snp_files{

    unless (-e $snp_file_strain){
        die "Expected SNP file [$snp_file_strain] for strain $strain did not exist! Please make sure that it is present in the current working directory\n\n";
    }
    unless (-e $snp_file_strain2){
        die "Expected SNP file 2 [$snp_file_strain2] for strain $strain2 did not exist! Please make sure that it is present in the current working directory\n\n";
    }

    # gzipped files are read via 'gunzip -c'; the list form of open bypasses the shell
    my ($fh_strain,$fh_strain2);
    if ($snp_file_strain =~ /\.gz$/){
        open ($fh_strain,'-|','gunzip','-c',$snp_file_strain) or die "Failed to read from gzipped file $snp_file_strain: $!\n\n";
    }
    else{
        open ($fh_strain,'<',$snp_file_strain) or die "Failed to read from $snp_file_strain: $!\n\n";
    }

    if ($snp_file_strain2 =~ /\.gz$/){
        open ($fh_strain2,'-|','gunzip','-c',$snp_file_strain2) or die "Failed to read from gzipped file $snp_file_strain2: $!\n\n";
    }
    else{
        open ($fh_strain2,'<',$snp_file_strain2) or die "Failed to read from file $snp_file_strain2: $!\n\n";
    }

    ### READING FROM SNP FILE FOR STRAIN 1 (these are only high confidence SNPs)
    warn "Storing SNP positions for strain $strain provided in '$snp_file_strain'\n";
    sleep (1);

    my $snp_count_strain = 0;

    while (my $line = <$fh_strain>){
        ++$snp_count_strain;
        chomp $line;
        # columns used: chromosome [1], position [2] and Ref/SNP bases [4]
        my ($chr,$pos,$diff)  = (split /\t/,$line)[1,2,4];
        my ($ref,$snp) = (split /\//,$diff);

        $snps{$chr}->{$pos}->{ref} = $ref;
        $snps{$chr}->{$pos}->{snp} = $snp;
        $snps{$chr}->{$pos}->{read} = 0; # counts how often the position is seen for strain 2
    }
    close $fh_strain or warn "Failed to close filehandle for file $snp_file_strain: $!\n";
    warn "Stored $snp_count_strain positions in total\n\n";

    ### READING FROM SNP FILE FOR STRAIN 2

    warn "Now reading and comparing SNP positions for strain $strain2 provided in '$snp_file_strain2'\n";

    my $snp_count_strain2 = 0;
    my $same = 0;
    my $different = 0;
    my $unique_ref = 0; # unique ref here means the new Reference, i.e. Strain
    my $unique_SNP = 0; # unique SNP here means the new SNP genome, i.e. Strain2
    my $confidence_discrepancy = 0; # a measure for how many times a SNP was found as homozygous in both strains but with low confidence in one of them

    while (my $line = <$fh_strain2>){

        ++$snp_count_strain2;
        chomp $line;

        my ($chr,$pos,$diff)  = (split /\t/,$line)[1,2,4];
        my $location = join (':',$chr,$pos);

        my ($ref,$snp) = (split /\//,$diff);

        if (exists $snps{$chr}->{$pos} ){

            $snps{$chr}->{$pos}->{read}++; # SNP is present in both genomes as high confidence SNP.

            unless ($ref eq $snps{$chr}->{$pos}->{ref}){
                warn "reference was different for the same position!!!\n";
            }

            # The SNP compared to the reference genome is the same in SNP=Strain2 ($snp) and Ref=Strain ($snps{$chr}->{$pos}->{snp})
            if ($snp eq $snps{$chr}->{$pos}->{snp}){
                ++$same;
                print OUT_COMMON "$line\n";
                next;
            }

            ++$different;

            ### we need a new SNP format where Ref/SNP is now Strain/Strain2
            # (this string always contains at least the '/', so it can never be empty)
            my $new_snp = "$snps{$chr}->{$pos}->{snp}/$snp";
            print ALL_STRAIN_STRAIN2 "$different\t$chr\t$pos\t1\t$new_snp\n";
        }
        else{
            # Now we need to check whether the SNP was also present but failing the filter in Strain 1.
            # Testing the outer key first stops 'exists' from autovivifying an empty
            # entry in %homozygous_SNPs for every single position that gets looked up
            if (exists $homozygous_SNPs{$location} and exists $homozygous_SNPs{$location}->{strain1_filter}){
                unless ($homozygous_SNPs{$location}->{strain1_filter} eq 1){
                    # if the position failed the filter we move on irrespective of what the genotype was
                    ++$confidence_discrepancy;
                    next;
                }
            }

            ++$unique_SNP;
            print OUT_STRAIN2 "$line\n"; # Strain has the same sequence as the reference ($genome_build)
            if ($line){
                print ALL_STRAIN_STRAIN2 "$line\n";
            }
            else{
                warn "'$line' is empty, skipping\n";
            }
        }
    }
    close $fh_strain2 or warn "Failed to close filehandle for file $snp_file_strain2: $!\n";

    warn "Finally, looking at new reference [$strain] specific reads...\n"; sleep (1);

    foreach my $chr (keys %snps){
        foreach my $pos (keys %{$snps{$chr}}){
            my $location = join (':',$chr,$pos);

            if ($snps{$chr}->{$pos}->{read} == 0){ # present only for Strain 1

                # Now we need to check whether the SNP was also present but failing the filter in Strain 2
                # (again without autovivifying entries in %homozygous_SNPs)
                if (exists $homozygous_SNPs{$location} and exists $homozygous_SNPs{$location}->{strain2_filter}){
                    unless ($homozygous_SNPs{$location}->{strain2_filter} eq 1){
                        # Confidence in Strain 2 SNP call was low, skipping this position irrespective of genotype
                        ++$confidence_discrepancy;
                        next;
                    }
                }

                ++$unique_ref;

                ### here we need to use the SNP position (i.e. Strain sequence) as new reference and the reference genome sequence as the SNP (i.e. where Strain2 is the same as the reference genome)
                print OUT_STRAIN "$chr\t$pos\t$snps{$chr}->{$pos}->{snp}/$snps{$chr}->{$pos}->{ref}\n"; # writing out an annotation track Strain vs reference genome

                print ALL_STRAIN_STRAIN2  "${strain}_${unique_ref}\t$chr\t$pos\t1\t$snps{$chr}->{$pos}->{snp}/$snps{$chr}->{$pos}->{ref}\n";
            }

            # the same position must not occur more than once in the strain 2 file
            if ($snps{$chr}->{$pos}->{read} >= 2){
                die "SNP was present at least twice: $chr\t$pos\tcount: $snps{$chr}->{$pos}->{read}\n\n";
            }
        }
    }

    warn "\nLooked at positions from new Reference strain [$strain]:\t\t$snp_count_strain\n";
    warn "Compared positions from new SNP strain [$strain2]:\t\t$snp_count_strain2\n";
    warn "======================================================\n";
    warn "SNPs were the same in Ref and SNP genome (not written out):\t$same\n";
    warn "SNPs were present in both Ref and SNP genome but had a different sequence:\t$different\n";
    warn "SNPs were low confidence in one strain and thus ignored:\t$confidence_discrepancy\n";
    warn "SNPs were unique to Ref [$strain]:\t\t\t\t$unique_ref\n";
    warn "SNPs were unique to SNP [$strain2]:\t\t\t\t$unique_SNP\n\n";

    print REPORT "Looked at positions from new Reference strain [$strain]:\t\t$snp_count_strain\n";
    print REPORT "Compared positions from new SNP strain [$strain2]:\t\t$snp_count_strain2\n";
    print REPORT "======================================================\n";
    print REPORT "SNPs were the same in Ref and SNP genome (not written out):\t$same\n";
    print REPORT "SNPs were present in both Ref and SNP genome but had a different sequence:\t$different\n";
    print REPORT "SNPs were low confidence in one strain and thus ignored:\t$confidence_discrepancy\n";
    print REPORT "SNPs were unique to Ref [$strain]:\t\t\t\t$unique_ref\n";
    print REPORT "SNPs were unique to SNP [$strain2]:\t\t\t\t$unique_SNP\n\n";

    # buffered write errors only surface at close, so the output files are checked here
    close OUT_STRAIN         or die "Failed to close filehandle OUT_STRAIN: $!\n";
    close OUT_STRAIN2        or die "Failed to close filehandle OUT_STRAIN2: $!\n";
    close OUT_COMMON         or die "Failed to close filehandle OUT_COMMON: $!\n";
    close ALL_STRAIN_STRAIN2 or die "Failed to close filehandle ALL_STRAIN_STRAIN2: $!\n";

}

# Applies the SNPs of one strain to a single chromosome of the genome held in
# %chromosomes and writes the result out as N-masked and/or full sequence version
# (depending on $nmasking and $full_sequence).
#
# Arguments:
#   $chr    - chromosome name; has to be present in %chromosomes, otherwise we die
#   $strain - strain name (this may be strain 1 or strain 2)
#
# Adds the per-chromosome counts to the file-level totals and reports them to STDERR
# and the REPORT filehandle.
sub create_modified_chromosome {

    my ($chr,$strain) = @_;
    warn "Processing chromosome $chr (for strain $strain)\n";

    if (not $chromosomes{$chr}){
        warn "\nThe chromosome name given in the VCF file was '$chr' and was not found in the reference genome.\nA rather common mistake might be that the VCF file was downloaded from Ensembl (who use chromosome names such as 1, 2, X, MT)\nbut the genome from UCSC (who use chromosome names such as chr1, chr2, chrX, chrM)\n";
        warn "The chromosome names in the reference genome folder were:\n";
        warn "$_\n" for sort keys %chromosomes;
        die "[FATAL ERROR] Please ensure that the same version of the genome is used for both VCF annotations and reference genome (FastA files). Exiting...\n\n";
    }

    # Working copies: one receives the SNP bases, the other one (N-masking only) the Ns
    my $full_seq   = $chromosomes{$chr};
    my $masked_seq = $nmasking ? $full_seq : undef;

    # read_snps returns the SNPs as [position, ref base, SNP base], sorted by position
    my @snps = @{ read_snps($chr,$strain) };
    warn "Clearing SNP array...\n" unless @snps;

    my $seen         = 0; # all SNP entries looked at
    my $previous_pos = 0; # position of the SNP handled last
    my $unchanged    = 0; # genome already carries the SNP base
    my $ref_mismatch = 0; # genome base differs from the annotated reference base (counted, not reported)
    my $masked       = 0; # positions replaced with N
    my $substituted  = 0; # positions replaced with the SNP base

    SNP:
    for my $entry (@snps) {
        ++$seen;
        my ($pos,$ref_base,$snp_base) = @{$entry};

        # the same position again: duplicate SNP
        next SNP if $pos == $previous_pos;
        $previous_pos = $pos;

        my $offset = $pos - 1; # SNP positions are 1-based, substr is 0-based

        # nothing to do if the genome base is the SNP base already
        if (substr($full_seq,$offset,1) eq $snp_base) {
            ++$unchanged;
            next SNP;
        }

        # the genome base has to agree with the annotated reference base
        if (substr($full_seq,$offset,1) ne $ref_base) {
            ++$ref_mismatch;
            next SNP;
        }

        ### Ref/Alt bases are matching, so the base may now be swapped for N and/or the SNP base

        if ($nmasking) { # 4-argument substr returns the base that got replaced
            substr($masked_seq,$offset,1,'N') or warn "Replacing failed...\n";
            ++$masked;
        }

        if ($full_sequence) {
            substr($full_seq,$offset,1,$snp_base) or warn "Replacing failed...\n";
            ++$substituted;
        }
    }

    # updating the genome-wide totals
    $new_n_total   += $masked;
    $new_snp_total += $substituted;
    $already_total += $unchanged;

    write_SNP_chromosome($chr,$masked_seq,1,$strain) if $nmasking;
    write_SNP_chromosome($chr,$full_seq,0,$strain)   if $full_sequence;

    warn "$seen SNPs total for chromosome $chr\n";
    if ($nmasking) { # default
        my $message = "$masked positions on chromosome $chr were changed to 'N'\n";
        warn $message;
        print REPORT $message;
    }
    if ($full_sequence) {
        my $message = "$substituted reference positions on chromosome $chr were changed to the SNP alternative base\n\n";
        warn $message;
        print REPORT $message;
    }
    warn "\n";

}

# Applies the strain/strain2 SNPs stored in %snps_dual_genome to a single chromosome
# of the genome held in %chromosomes and writes the result out as N-masked and/or
# full sequence version of the dual hybrid genome.
#
# Arguments:
#   $chr     - chromosome name
#   $strain  - name of the strain acting as reference
#   $strain2 - name of the strain acting as SNP genome
#
# Adds the per-chromosome counts to the file-level totals and reports them to STDERR
# and the REPORT filehandle.
sub create_modified_chromosome_dual_hybrid {

    my ($chr,$strain,$strain2) = @_;
    warn "Processing chr$chr (to create new genome for $strain/$strain2)\n";

    # Working copies: one receives the SNP bases, the other one (N-masking only) the Ns
    my $full_seq   = $chromosomes{$chr};
    my $masked_seq = $nmasking ? $full_seq : undef;

    my $seen         = 0; # all SNP positions looked at
    my $previous_pos = 0; # position handled last
    my $unchanged    = 0; # genome already carries the SNP base
    my $ref_mismatch = 0; # genome base differs from the annotated reference base (counted, not reported)
    my $masked       = 0; # positions replaced with N
    my $substituted  = 0; # positions replaced with the SNP base

    POSITION:
    for my $pos (keys %{ $snps_dual_genome{$chr} }) {

        ++$seen;

        # the same position again: duplicate SNP
        next POSITION if $pos == $previous_pos;
        $previous_pos = $pos;

        my $bases  = $snps_dual_genome{$chr}->{$pos}; # holds the keys 'ref' and 'snp'
        my $offset = $pos - 1;                        # substr is 0-based

        # nothing to do if the genome base is the SNP base already
        if (substr($full_seq,$offset,1) eq $bases->{snp}) {
            ++$unchanged;
            next POSITION;
        }

        # the genome base has to agree with the annotated reference base
        if (substr($full_seq,$offset,1) ne $bases->{ref}) {
            ++$ref_mismatch;
            next POSITION;
        }

        ### Ref/Alt bases are matching, so the base may now be swapped for N and/or the SNP base

        if ($nmasking) { # 4-argument substr returns the base that got replaced
            substr($masked_seq,$offset,1,'N') or warn "Replacing failed...\n";
            ++$masked;
        }

        if ($full_sequence) {
            substr($full_seq,$offset,1,$bases->{snp}) or warn "Replacing failed...\n";
            ++$substituted;
        }
    }

    # updating the genome-wide totals
    $new_n_total   += $masked;
    $new_snp_total += $substituted;
    $already_total += $unchanged;

    my $genome_name = "${strain}_${strain2}_dual_hybrid.based_on_${genome_build}";
    write_SNP_chromosome($chr,$masked_seq,1,$genome_name) if $nmasking;
    write_SNP_chromosome($chr,$full_seq,0,$genome_name)   if $full_sequence;

    warn "$seen SNPs total for chromosome $chr\n";
    if ($nmasking) { # default
        my $message = "$masked positions on chromosome $chr were changed to 'N'\n";
        warn $message;
        print REPORT $message;
    }
    if ($full_sequence) {
        my $message = "$substituted reference positions on chromosome $chr were changed to the SNP alternative base\n\n";
        warn $message;
        print REPORT $message;
    }
    warn "\n";

}


# Writes a chromosome sequence as FastA file (100 bases per line) into the folder
# "${parent_dir}/${strain}_N-masked/" or "${parent_dir}/${strain}_full_sequence/".
# The folder is created if it doesn't exist yet.
#
# Arguments:
#   $chr      - chromosome name (used for the FastA header and the file name)
#   $sequence - the sequence to write out
#   $nm       - true: N-masked output; false (0): full sequence (SNPs introduced) output
#   $strain   - strain name or genome label used as prefix of the output folder
#
# Dies if the output folder can't be created or the file can't be written or closed.
sub write_SNP_chromosome {

    my ($chr,$sequence,$nm,$strain) = @_; # $nm will discriminate between N-masking and full sequence output

    my ($type,$outfile,$description);
    if ($nm){
        warn "Writing modified chromosome (N-masking)\n";
        $type        = 'N-masked';
        $outfile     = "chr${chr}.N-masked.fa";
        $description = 'N-masked';
    }
    else{
        warn "Writing modified chromosome (incorporating SNPs)\n";
        $type        = 'full_sequence';
        $outfile     = "chr${chr}.SNPs_introduced.fa";
        $description = 'full sequence';
    }

    my $out_dir = "${parent_dir}/${strain}_${type}";
    warn "Writing $description output to: ${out_dir}/$outfile\n";

    # creating the output directory if required; the second -d test tolerates the
    # directory having been created by someone else in the meantime
    unless (-d "${out_dir}/"){
        mkdir "${out_dir}/" or -d "${out_dir}/" or die "Failed to create directory ${out_dir}/: $!\n\n";
    }

    open (my $out,'>',"${out_dir}/${outfile}") or die "Failed to write to file ${out_dir}/${outfile}: $!\n\n";
    print {$out} ">$chr\n";

    # Writing out chromosome files with 100 characters per line
    my $pos = 0;
    my $last_full_line_start = length($sequence) - 100;
    while ($pos < $last_full_line_start) {
        print {$out} substr($sequence,$pos,100),"\n";
        $pos += 100;
    }
    print {$out} substr($sequence,$pos),"\n"; # rest (up to 100 characters)

    close $out or die "Failed to close file ${out_dir}/${outfile}: $!\n\n";

}


# Reads the SNP file of a single chromosome ("${parent_dir}/SNPs_${strain}/chr$chr.txt")
# and returns a reference to an array of [position, reference base, SNP base] entries,
# sorted by position in ascending order.
#
# Arguments:
#   $chr    - chromosome name
#   $strain - strain name (determines the SNP folder)
#
# Lines without an allele column (such as the '>chr' header line) are ignored, alleles
# other than single G/A/T/C bases are skipped with a message, and alleles given for
# strand -1 are complemented. Returns a reference to an empty array if there is no
# SNP file for the chromosome. Dies if the SNP folder doesn't exist or the file
# can't be opened.
sub read_snps {

    my ($chr,$strain) = @_;
    my @snps = ();
    my $file = "${parent_dir}/SNPs_${strain}/chr$chr.txt";

    ### If the SNP folder doesn't exist we can be certain that something is going wrong
    unless (-d "${parent_dir}/SNPs_${strain}"){
        die "Folder >>${parent_dir}/SNPs_${strain}<< doesn't exist. Try losing the option --skip_filtering to generate the folder and SNP files from the VCF file\n\n";
    }

    ### For some chromosomes there might not be any SNP files, e.g. chr MT or chrY. In this case the sequence is written out again unmodified
    unless (-e $file) {
        warn "Couldn't find SNP file for chromosome '$chr' '$file' didn't exist. Skipping...\n";
        return \@snps;
    }
    warn "Reading SNPs from file $file\n";

    open (my $in,'<',$file) or die "Failed to read SNP file $file: $!\n";

    while (my $line = <$in>) {
        $line =~ s/\r//; # Windows line endings...
        chomp $line;
        next unless ($line);

        my (undef,undef,$pos,$strand,$allele) = split(/\t/,$line);
        next unless ($allele);

        my ($ref_allele,$snp_allele);

        if ($allele =~ /^([GATC])\/([GATC])$/) {
            $ref_allele = $1;
            $snp_allele = $2;
        }
        else {
            warn "Skipping allele '$allele' as it appears to contain non DNA bases (only G,A,T,C allowed)\n";
            next;
        }

        if ($strand == -1) { # if the strand is given as -1 it means that the SNP is on the reverse strand and thus needs complementing
            $ref_allele =~ tr/GATC/CTAG/;
            $snp_allele =~ tr/GATC/CTAG/;
        }

        push @snps,[$pos,$ref_allele,$snp_allele];
    }

    # closing the file before returning (a close placed after the return is never reached)
    close $in or warn "Failed to close filehandle IN for file $file: $!\n\n";

    # sorting snps by position
    @snps = sort {$a->[0] <=> $b->[0]} @snps;

    return \@snps;

}


###

### Filters the VCF file for homozygous, high confidence SNPs of a single strain.
###
### Arguments:
###   $strain          - name of the strain (used for folder, file and report names)
###   $strain_index    - index of the column holding the calls of this strain in the VCF table
###   $strain_identity - 1 or 2; only used in dual hybrid mode to tell the two strains apart
###
### Uses the following variables that are not declared inside this sub: $vcf_file, @chroms, $v7,
### $dual_hybrid, %homozygous_SNPs and $genome_build.
###
### Output (all relative to the current working directory):
###   SNPs_<strain>/chr<chromosome>.txt        - accepted SNPs, one file per chromosome
###   <strain>_SNP_filtering_report.txt        - filtering summary (also printed to STDERR)
###   all_SNPs_<strain>_<genome_build>.txt.gz  - all accepted SNPs in a single gzipped file
###
### Dies if files cannot be opened or written, or if the VCF file lacks the '#CHROM' table header
### or the FORMAT keys GT and FI.
sub filter_relevant_SNP_calls_from_VCF{

    my ($strain, $strain_index, $strain_identity) = @_;

    # Compressed files are streamed through gunzip. The list form of open is used so that the
    # file name is handed to gunzip directly and is never interpreted by a shell.
    my $vcf_fh;
    if ($vcf_file =~ /gz$/){
        open ($vcf_fh,'-|','gunzip','-c',$vcf_file) or die "Failed to open file '$vcf_file': $!\n";
    }
    else{
        open ($vcf_fh,'<',$vcf_file) or die "Failed to read Input VCF file '$vcf_file': $!\n";
    }

    my %all_SNPs; # storing filtered SNPs
    my $count = 0;

    my $other = 0;
    my $too_many = 0;

    my %fhs;
    my $hcg_count = 0;
    my $low_confidence = 0;
    my $same = 0;
    my $homozygous = 0;

    my $indel_pos = 0;

    my $format_index;    # required to extract entries from the FORMAT field
    my $info_index;      # required to look at the INFO field, e.g. for INDELs
    my $gt_index;        # required to get the GENOTYPE
    my $fi_index;        # required to get the FILTER value

    my $dir = "SNPs_$strain";
    unless (-d $dir){
        warn "Folder '$dir' doesn't exist. Creating it for you...\n\n";
        mkdir $dir or die "Failed to create directory $dir: $!\n\n";
    }

    # Opens the SNP file of one chromosome for writing and prints its '>chromosome' header line
    my $open_snp_file = sub {
        my ($chr) = @_;
        my $filename = "$dir/chr".$chr.'.txt';
        open (my $fh,'>',$filename) or die "Couldn't open filehandle for file $filename: $!\n";
        print {$fh} ">$chr\n";
        return $fh;
    };

    # Opening filehandles for the SNP files of all chromosomes listed in @chroms
    for my $chr (@chroms) {
        $fhs{$chr} = $open_snp_file->($chr);
    }

    while (my $line = <$vcf_fh>){
        $line =~ s/(\r|\n)//g; # removing end of line characters

        next if ($line =~ /^\#\#/); # filters out header information lines
        if ($line =~ /^\#CHROM/){   # Table Header
            my @header_fields = split (/\t/,$line);
            my $name = $header_fields[$strain_index];
            warn "Analysing SNP fields for name >$name<\n";

            for my $field_index (0..$#header_fields){
                if ($header_fields[$field_index] eq "FORMAT"){
                    $format_index = $field_index;
                }
                if ($header_fields[$field_index] eq "INFO"){
                    $info_index = $field_index;
                }
            }

            if (defined $format_index){
                warn "Using FORMAT field index: $format_index\n";
            }
            else{
                die "Failed to extract index of field 'FORMAT'. Hmmm...";
            }
            if (defined $info_index){
                warn "Using INFO field index: $info_index\n";
            }
            else{
                die "Failed to extract index of field 'INFO'. Hmmm...";
            }
            next;
        }

        # Without the table header the column indices are unknown, and any field extracted below would be meaningless
        unless (defined $format_index and defined $info_index){
            die "Found a variant line before the '#CHROM' table header in file '$vcf_file'. Please check that this is a valid VCF file\n\n";
        }

        $count++;
        if ($count%1000000 ==0){
            warn "processed $count lines\n";
        }

        # $sample holds the colon-separated call of the strain of interest (keys as given in FORMAT)
        my ($chr,$pos,$ref,$alt,$info,$format,$sample) = (split (/\t/,$line))[0,1,3,4,$info_index,$format_index,$strain_index];

        # 06 April 2021: adapting for variable VCF format
        unless (defined $gt_index){ # only needed once
            warn "GT index not defined, checking...\n";

            my @format_keys = split (/:/,$format);
            my %position_of;
            @position_of{@format_keys} = (0..$#format_keys);

            if (exists $position_of{'GT'}){
                $gt_index = $position_of{'GT'};
                warn "Setting GT index to >>$position_of{'GT'}<<\n";
            }
            else{
                die "Failed to extract GT index. I am afraid this will need some manual looking into..."
            }
            if (exists $position_of{'FI'}){
                $fi_index = $position_of{'FI'};
                warn "Setting FI index to >>$position_of{'FI'}<<\n";
            }
            else{
                die "Failed to extract FI index. I am afraid this will need some manual looking into..."
            }
        }

        ## Now getting the genotype call ($gt) and the FILTER call ($fi) of the strain
        my ($gt,$fi) = (split (/:/,$sample))[$gt_index,$fi_index];

        # The v7 file contains both SNPs as well as INDELS. We are only interested in SNPs here,
        # so removing all INDELs
        if ($v7){
            if ($info =~ /INDEL/){
                ++$indel_pos;
                next;
            }
        }

        # $gt is the Genotype:
        # '.'   = no genotype call was made
        # '0/0' = genotype is the same as the reference genome
        # '1/1' = homozygous alternative allele; can also be '2/2',
        # '3/3', etc. if more than one alternative allele is present.
        # '0/1' = heterozygous genotype; can also be '1/2', '0/2', etc.

        # $fi is FILTER, 1 for high confidence SNP, or 0 for low confidence

        ### We are only looking for 1/1, 2/2 or 3/3 calls, and filter for high confidence as well

        # skipping if the reference base is not well defined in Black6
        if ($ref =~ /[^ATCG]/){ # reference contained any non A, C, T, G characters
            warn "ref was: $ref; skipping\n";
            next;
        }

        # skipping if the SNP is not well defined in the strain of interest
        if ($alt =~ /[^ATCG,]/){ # ALT contained characters other than A, C, T, G or the commas which separate several different variants
            ++$too_many;
            next;
        }

        my $location = join (':',$chr,$pos);

        ### For dual hybrids positions whose FI value is not 1 are recorded in %homozygous_SNPs, together with
        ### the genotype call, so that an additional filter can be applied to these positions later
        if ($dual_hybrid and $fi ne 1){
            if ($strain_identity == 1){
                $homozygous_SNPs{$location}->{strain1_filter}   = $fi;
                $homozygous_SNPs{$location}->{strain1_genotype} = $gt;
            }
            elsif($strain_identity == 2){
                $homozygous_SNPs{$location}->{strain2_filter}   = $fi;
                $homozygous_SNPs{$location}->{strain2_genotype} = $gt;
            }
            else{
                die "Strain was lacking the strain identity flag\n\n";
            }
        }

        # Filtering for genotype. If the genotype is the same as the reference we will move on to the next position
        if ($gt eq '0/0'){
            ++$same;
            next;
        }
        elsif ($gt =~ /^([123])\/\1\z/){
            # homozygous call for the 1st, 2nd or 3rd alternative allele
            my $allele_number = $1;

            if ($alt =~ /,/){
                # several alternative alleles are present, picking the one the genotype refers to
                my $called_allele = (split (/,/,$alt))[$allele_number - 1];
                unless (defined $called_allele and length $called_allele){
                    # the genotype points to an allele that is not listed in the ALT field
                    ++$too_many;
                    next;
                }
                $alt = $called_allele;
            }
            elsif ($allele_number > 1){
                # genotype refers to a 2nd or 3rd allele, but only a single ALT allele is listed
                ++$too_many;
                next;
            }
            ++$homozygous;
        }
        else{
            # this could be positions without genotype call ./. or any form of heterozygous call.
            # These positions are skipped here (in dual hybrid mode they may have been recorded above already)
            ++$other;
            next;
        }

        ### If we made it this far the SNP is a homozygous substitution in the current strain

        # Looking at the Filtering tag now
        if ($fi == 1){
            ++$hcg_count;

            # field 4: 1 means top strand, -1 means reverse strand
            my $SNP = join ("\t",$count,$chr,$pos,'1',join ("\/",$ref,$alt));

            # only the first SNP seen at a given location is kept for the all-SNP file
            unless (exists $all_SNPs{$location}){
                $all_SNPs{$location} = $SNP;
            }
        }
        else{
            ++$low_confidence;
            next;
        }

        # Chromosomes that were not announced in the VCF header have no filehandle yet, so
        # their SNP file is created on demand instead of printing to an undefined filehandle
        unless (defined $fhs{$chr}){
            warn "Chromosome '$chr' was not listed in the VCF header. Creating a SNP file for it now\n";
            $fhs{$chr} = $open_snp_file->($chr);
        }

        # Output example
        # Variation ID    Chromosome name Position on Chromosome (bp)     Strand  Allele
        # rs2020560       10      98212004        1       A/T
        # The complete call of the strain is appended as an additional last column

        print {$fhs{$chr}} join ("\t",$count,$chr,$pos,'1',join ("\/",$ref,$alt),$sample),"\n";

    }

    # For the gunzip pipe a failing close means that gunzip exited abnormally, e.g. because the file is truncated
    close $vcf_fh or warn "Failed to close filehandle for file '$vcf_file' (exit status: ".($? >> 8)."): $!\n";

    # Buffered write errors only surface when closing, so every SNP file is checked
    foreach my $chr (keys %fhs){
        close $fhs{$chr} or die "Failed to close SNP file for chromosome '$chr': $!\n";
    }

    # Writing a report file; the same summary is also printed to STDERR
    my $report = "${strain}_SNP_filtering_report.txt"; # $strain here may be strain 1 or strain 2
    open (my $report_fh,'>',$report) or die "Failed to write to file $report: $!\n";

    my $summary = "SNP position summary for strain $strain (based on genome build $genome_build)\n";
    $summary .= ('=' x 75)."\n\n";
    $summary .= "Positions read in total:\t$count\n\n";
    if ($v7){
        $summary .= "$indel_pos\tPositions were INDELs (and hence skipped)\n";
    }
    $summary .= "$homozygous\tSNP were homozygous. Of these:\n";
    $summary .= "$hcg_count\tSNP were homozygous and passed high confidence filters and were thus included into the $strain genome\n";
    $summary .= "\nNot included into $strain genome:\n";
    $summary .= "$same\thad the same sequence as the reference\n";
    $summary .= "$too_many\t\thad no clearly defined alternative base\n";
    $summary .= "$other\t\tCalls were neither 0/0 (same as reference) or 1/1, 2/2, 3/3 (homozygous SNP)\n";
    $summary .= "$low_confidence\t\twere homozygous but the filtering call was low confidence\n\n";

    warn "\n",$summary;
    print {$report_fh} $summary;

    # Also writing all SNP calls out to an all-SNP file
    my $all_SNPs = "all_SNPs_${strain}_${genome_build}.txt.gz";
    warn "Now printing a single list of all SNPs to >$all_SNPs<...\n";
    print {$report_fh} "Printed a single list of all SNPs to >$all_SNPs<...\n";
    close ($report_fh) or warn "Failed to close filehandle REPORT\n";

    if (-e $all_SNPs){
        warn "File '$all_SNPs' existed in the folder already, overwriting it...\n\n";
    }
    # the output redirection requires a shell here
    open (my $all_snps_fh,'|-',"gzip -c - > $all_SNPs") or die "Failed to write to file $all_SNPs: $!\n";

    foreach my $location (keys %all_SNPs){
        print {$all_snps_fh} "$all_SNPs{$location}\n";
    }
    # closing the pipe waits for gzip to finish and reveals whether writing the file succeeded
    close $all_snps_fh or die "Failed to finish writing file $all_SNPs (gzip exit status: ".($? >> 8)."): $!\n";
    warn "complete\n\n";

    return 1;
}


### Detects the chromosome names from the '##contig' header lines of the VCF file ($vcf_file).
###
### Only the header of the file is read; reading stops at the first line that does not start with '#'.
### Returns the list of unique chromosome/contig IDs (in no particular order).
### Dies if the VCF file cannot be opened.
sub detect_chroms{

    my %chrom; # detecting the chromosomes from the VCF file
    my @chrom;

    # Compressed files are streamed through gunzip. The list form of open hands the file name
    # to gunzip directly, so it is never interpreted by a shell.
    my $detect_fh;
    if ($vcf_file =~ /gz$/){
        open ($detect_fh,'-|','gunzip','-c',$vcf_file) or die "Failed to open file '$vcf_file': $!\n";
    }
    else{
        open ($detect_fh,'<',$vcf_file) or die "Failed to read Input VCF file '$vcf_file': $!\n";
    }

    while (my $line = <$detect_fh>){
        $line =~ s/(\r|\n)//g; # removing end of line characters
        last unless ($line =~ /^\#/);

        next unless ($line =~ /^\#\#contig/); # only contig header lines are of interest

        # The ID ends at the next comma, or at the closing '>' if ID is the only entry of the line.
        # The capture is only used if the match succeeded (otherwise $1 would be stale or undefined)
        if ($line =~ /ID=([^,>]+)/){
            $chrom{$1}++;
        }
        else{
            warn "Failed to extract a chromosome name from VCF header line: $line\n";
        }
    }

    # The return value of close is ignored on purpose: reading stops after the header, so for
    # compressed files gunzip is terminated early and close reports a failure that is expected here
    close $detect_fh;

    @chrom = keys %chrom;
    return @chrom;

}

### Detects the strains that are available in a VCF file.
###
### Argument: the name of the VCF file (plain text, or gzip compressed if the name ends in 'gz').
### Only the header is read; reading stops at the first line that does not start with '#'.
### Returns a hash (as a flat list) of strain name => column index in the VCF table.
### Dies if the VCF file cannot be opened.
sub detect_strains{

    my $vcf_file = shift;
    my %strains; # detecting the available strains from the VCF file

    # Compressed files are streamed through gunzip. The list form of open hands the file name
    # to gunzip directly, so it is never interpreted by a shell.
    my $strain_fh;
    if ($vcf_file =~ /gz$/){
        open ($strain_fh,'-|','gunzip','-c',$vcf_file) or die "Failed to open file '$vcf_file': $!\n";
    }
    else{
        open ($strain_fh,'<',$vcf_file) or die "Failed to read Input VCF file '$vcf_file': $!\n";
    }

    while (my $line = <$strain_fh>){
        $line =~ s/(\r|\n)//g; # removing end of line characters
        next if ($line =~ /^\#\#/);
        last unless ($line =~ /^\#/); # everything from now on are the SNPs themselves

        # This is the table header line (#CHROM ...) listing all different strains
        my @columns = split (/\t/,$line);

        # The first 9 fields (indices 0-8) are not strains:
        #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
        foreach my $index (9..$#columns){
            $strains{$columns[$index]} = $index;
        }
    }

    # The return value of close is ignored on purpose: reading stops after the header, so for
    # compressed files gunzip is terminated early and close reports a failure that is expected here
    close $strain_fh;

    return %strains;

}

### Reads all FastA files of the reference genome folder ($genome_folder) and stores the
### upper-cased sequences in the %chromosomes hash, keyed by chromosome name.
###
### Argument: the directory to change back to once the genome has been read.
### Files with the extension .fa are used; if there are none, files with the extension .fasta
### are used instead. Both single-entry and multi-FastA files are handled.
### Dies if the folder cannot be entered, contains no FastA files, a file is empty or is not in
### FastA format, or if a chromosome name occurs more than once.
sub read_genome_into_memory{

    ## working directory
    my $cwd = shift;

    ## reading in and storing the specified genome in the %chromosomes hash
    chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
    warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

    my @chromosome_filenames =  <*.fa>;

    ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
    unless (@chromosome_filenames){
        @chromosome_filenames =  <*.fasta>;
    }

    unless (@chromosome_filenames){
        die "The specified reference genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
    }

    foreach my $chromosome_filename (@chromosome_filenames){

        # lexical filehandle and 3-argument open so that the file name is never interpreted as an open mode
        open (my $chr_fh,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

        ### first line needs to be a fastA header
        my $first_line = <$chr_fh>;
        unless (defined $first_line){
            die "The sequence file $chromosome_filename is empty and thus doesn't seem to be in FASTA format as required!\n";
        }
        chomp $first_line;
        $first_line =~ s/\r//;

        ### Extracting chromosome name from the FastA header
        my $chromosome_name = extract_chromosome_name($first_line);

        # starting with an empty string so that entries without any sequence report a length of 0
        my $sequence = '';

        while (my $line = <$chr_fh>){
            chomp $line;
            $line =~ s/\r//; # removing carriage returns if present

            if ($line =~ /^>/){

                ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
                if (exists $chromosomes{$chromosome_name}){
                    print "chr $chromosome_name (",length $sequence ," bp)\n";
                    die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
                }
                else {
                    if (length($sequence) == 0){
                        warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
                    }
                    print "chr $chromosome_name (",length $sequence ," bp)\n";
                    $chromosomes{$chromosome_name} = $sequence;
                }
                ### resetting the sequence variable
                $sequence = '';
                ### setting new chromosome name
                $chromosome_name = extract_chromosome_name($line);
            }
            else{
                $sequence .= uc $line;
            }
        }

        close $chr_fh or warn "Failed to close filehandle for sequence file $chromosome_filename: $!\n";

        ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files

        if (exists $chromosomes{$chromosome_name}){
            print "chr $chromosome_name (",length $sequence ," bp)\t";
            die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
        }
        else{
            if (length($sequence) == 0){
                warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
            }

            print "chr $chromosome_name (",length $sequence ," bp)\n";
            $chromosomes{$chromosome_name} = $sequence;
        }
    }
    print "\n";
    chdir $cwd or die "Failed to move to directory $cwd\n";

}

### Returns the chromosome name of a FastA header line, i.e. the text between the leading '>'
### and the first whitespace (Bowtie seems to name sequences the same way).
### Dies if the line does not start with '>'.
sub extract_chromosome_name {
    my ($fasta_header) = @_;

    # anything that does not start with '>' is not a FastA header
    unless ($fasta_header =~ /^>/){
        die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
    }

    # dropping the leading '>' and keeping the first whitespace-delimited field only
    my ($first_field) = split (/\s+/,substr($fasta_header,1));
    return $first_field;
}


###################################################################


### Parses and validates the command line options.
###
### Returns the list
###   ($vcf_file, $strain, $strain2, $strain_index, $strain2_index, $genome_folder, $skip_filtering,
###    $nmasking, $full_sequence, $dual_hybrid, $genome_build, $v7_MGP_file)
### where the strain indices are the column indices of the strains in the VCF file (undefined
### with --skip_filtering) and $genome_folder is an absolute path with a trailing '/'.
###
### Exits after printing the help text, the version or the list of strains. Dies on invalid
### options, a missing VCF file, a missing or unknown strain, identical strains in dual hybrid
### mode, or a missing or inaccessible reference genome folder.
sub process_commandline{
    my $help;
    my $version;
    my $vcf_file;
    my $strain;
    my $strain2;
    my $list_strains;
    my $skip_filtering;
    my $full_sequence;
    my $genome_folder;
    my $nmasking;
    my $dual_hybrid;
    my $no_nmasking;
    my $genome_build;
    my $v7_MGP_file;

    my $command_line = GetOptions ('help|man'             => \$help,
                                   'versions'             => \$version,
                                   'strain=s'             => \$strain,
                                   'strain2=s'            => \$strain2,
                                   'list_strains'         => \$list_strains,
                                   'skip_filtering'       => \$skip_filtering,
                                   'vcf_file=s'           => \$vcf_file,
                                   'full_sequence'        => \$full_sequence,
                                   'nmasking'             => \$nmasking,
                                   'no_nmasking'          => \$no_nmasking,
                                   'dual_hybrid'          => \$dual_hybrid,
                                   'reference_genome=s'   => \$genome_folder,
                                   'genome_build=s'       => \$genome_build,
                                   'v7_VCF'               => \$v7_MGP_file,
        );

    ### EXIT ON ERROR if there were errors with any of the supplied options
    unless ($command_line){
        die "Please respecify command line options\n";
    }

    ### HELPFILE
    if ($help){
        print_helpfile();
        exit;
    }

    if ($version){
        print << "VERSION";
	
                               SNPsplit Genome Preparation
			           version: $pipeline_version
                             Copyright 2014-23, Felix Krueger
                                  Altos Bioinformatics
                        https://github.com/FelixKrueger/SNPsplit


VERSION
        exit;
    }

    # Prints the names of the strains found in the VCF file (sorted) to STDOUT, framed by a
    # headline and separator lines on STDERR. Expects a reference to the strain hash.
    my $list_available_strains = sub {
        my ($available) = @_;
        warn "\nAvailable genomes to choose from are:\n";
        warn "="x37,"\n";
        foreach my $name (sort keys %{$available}){
            print "$name\n";
        }
        warn "="x37,"\n";
    };

    if (defined $vcf_file){
        unless(-e $vcf_file){
            die "Input VCF file '$vcf_file' doesn't exist in the folder. Please check filenames and try again!\n\n";
        }
        # literal substring comparison (in a regex the dots of the file name would match any character)
        if (index($vcf_file,'mgp_REL2005_snps_indels.vcf.gz') >= 0){
            warn "Setting option --v7_VCF as the supplied file is called 'mgp_REL2005_snps_indels.vcf.gz'\n\n";
            $v7_MGP_file = 1;
            sleep(1);
        }

        ### March 2021: Now also accepting the v7 combined SNP and INDEL file
        if ($v7_MGP_file){
            my $tmp_vcf_name = $vcf_file;
            $tmp_vcf_name =~ s/.*\///; # removing path information

            if($tmp_vcf_name eq 'mgp_REL2005_snps_indels.vcf.gz') {
                warn "Using v7 MGP file 'mgp_REL2005_snps_indels.vcf.gz'. This file can be obtained from:\n";
                warn "ftp://ftp-mouse.sanger.ac.uk/REL-2004-v7-SNPs_Indels/mgp_REL2005_snps_indels.vcf.gz\n\n";
                warn "PLEASE NOTE:\n============\nThis file is currently not marked as Current_SNPs, so consider this approach experimental for the time being (as of 15 03 2021)\n\n";
                sleep(1);
            }
            else{
                warn "Version 7 input file selected. Input VCF file '$vcf_file' doesn't appear to be file 'mgp_REL2005_snps_indels.vcf.gz'. If something goes wrong, I won't accept responsibility....!\n\n";
                sleep(3);
            }
        }
    }
    else{
        unless($skip_filtering){
            die "\nYou need to provide a VCF file detailing SNPs positions with '--vcf_file your.file' (e.g.: --vcf mgp.v5.merged.snps_all.dbSNP142.vcf.gz). Please respecify!\n\n";
        }
    }

    my $strain_index;
    my $strain2_index;
    unless($skip_filtering){
        my %strains = detect_strains($vcf_file);

        if ($list_strains){
            $list_available_strains->(\%strains);
            warn "\nPlease choose a strain using '--strain NAME' to continue.\n\n";
            exit;
        }

        ### Strain 1 (required)
        unless (defined $strain){
            warn "No strain specified!\n";
            $list_available_strains->(\%strains);
            die "\nPlease choose one of the available strains using '--strain NAME' and try again\n\n";
        }
        unless (exists $strains{$strain}){
            warn "Strain name specified [$strain] does not match any of the available strain names!\n";
            $list_available_strains->(\%strains);
            die "\nPlease double check the name and try again (using '--strain NAME')\n\n";
        }
        $strain_index = $strains{$strain};
        warn "Strain defined as '$strain' (strain index: $strain_index)\n";

        ### Strain 2 (optional)
        if (defined $strain2 and not $dual_hybrid){
            warn "Strain 2 specified, setting option '--dual_hybrid'\n";
            $dual_hybrid = 1;
        }
        if ($dual_hybrid){
            warn "Dual Hybrid strain selected\n";

            # this automatically sets the full_sequence option
            $full_sequence = 1;

            unless (defined $strain2){
                warn "No strain 2 specified!\n";
                $list_available_strains->(\%strains);
                die "\nPlease choose one of the available strains using '--strain2 NAME' and try again\n\n";
            }
            unless (exists $strains{$strain2}){
                warn "Strain2 name specified [$strain2] does not match any of the available strain names!\n";
                $list_available_strains->(\%strains);
                die "\nPlease double check the name and try again (using '--strain2 NAME')\n\n";
            }
            $strain2_index = $strains{$strain2};
            warn "Strain2 defined as '$strain2' (strain2 index: $strain2_index)\n";

            # make sure that Strain 1 and Strain 2 are different from each other
            if ($strain eq $strain2){
                die "Strain 1 [$strain] and Strain 2 [$strain2] must be different from each other. Please respecify!\n\n";
            }
        }
    }

    ### With --skip_filtering the strain name cannot be checked against the VCF file, but it is still
    ### required since the SNP files are expected in a folder called 'SNPs_<strain_name>'
    if ($skip_filtering and not defined $strain){
        die "\nNo strain specified! Option '--skip_filtering' expects the SNP files in a folder called 'SNPs_<strain_name>', so please provide the strain name using '--strain NAME' and try again\n\n";
    }

    ### GENOME FOLDER
    # a missing reference genome is an error, so the program has to end with a non-zero exit status
    unless ($genome_folder){
        die "Reference genome folder was not specified! Please use --reference_genome </genome/folder/>\n";
    }
    my $parent_dir = getcwd();

    ### checking that the genome folder exists and making its path absolute
    unless ($genome_folder =~/\/$/){
        $genome_folder =~ s/$/\//;
    }

    if (chdir $genome_folder){
        my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
        unless ($absolute_genome_folder =~/\/$/){
            $absolute_genome_folder =~ s/$/\//;
        }
        warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder')\n\n";
        $genome_folder = $absolute_genome_folder;
    }
    else{
        die "Failed to move to genome folder > $genome_folder <: $!\n\nSNPsplit_genome_preparation --help for more details\n\n";
    }
    chdir $parent_dir or die "Failed to move back to parent directory $parent_dir\n\n";

    ### N-masking mode of action
    if ($no_nmasking){
        $nmasking = 0;
        $full_sequence = 1;
    }
    else{
        $nmasking = 1; # This is the default
    }

    unless (defined $genome_build){
        $genome_build = 'GRCm39';
    }

    return ($vcf_file,$strain,$strain2,$strain_index,$strain2_index,$genome_folder,$skip_filtering,$nmasking,$full_sequence,$dual_hybrid,$genome_build,$v7_MGP_file);

}


### Prints the help text to STDOUT and ends the program.
### The help text is only shown on explicit request (--help/--man), so the exit status is 0.
### Option descriptions are indented with spaces only, so that the columns line up
### irrespective of the tab width of the terminal.
sub print_helpfile{
  print <<'EOF';

  SYNOPSIS:

SNPsplit_genome_preparation is designed to read in a variant call file from the Mouse Genomes Project (e.g. the
current v8 file: 'mgp_REL2021_snps.vcf.gz') and generate new genome versions where the strain SNPs are either incorporated
into the new genome (full sequence) or masked by the ambiguity nucleo-base 'N' (N-masking).

SNPsplit_genome_preparation may be run in two different modes:

Single strain mode:

   1) The VCF file is read and filtered for high-confidence SNPs in the strain specified with --strain <name>
   2) The reference genome (given with --reference_genome <genome>) is read into memory, and the filtered high-
      confidence SNP positions are incorporated either as N-masking (default) or full sequence (option --full_sequence)

Dual strain mode:

   1) The VCF file is read and filtered for high-confidence SNPs in the strain specified with --strain <name>
   2) The reference genome (given with --reference_genome <genome>) is read into memory, and the filtered high-
      confidence SNP positions are incorporated as full sequence and optionally as N-masking
   3) The VCF file is read one more time and filtered for high-confidence SNPs in strain 2 specified with --strain2 <name>
   4) The filtered high-confidence SNP positions of strain 2 are incorporated as full sequence and optionally as N-masking
   5) The SNP information of strain and strain 2 relative to the reference genome build are compared, and a new Ref/SNP
      annotation is constructed whereby the new Ref/SNP information will be Strain/Strain2 (and no longer the standard
      reference genome strain Black6 (C57BL/6J))
   6) The full genome sequence given with --strain <name> is read into memory, and the high-confidence SNP positions between
      Strain and Strain2 are incorporated as full sequence and optionally as N-masking

The resulting .fa files are ready to be indexed with your favourite aligner. Proved and tested aligners include Bowtie2,
Tophat, STAR, Hisat2, HiCUP and Bismark. Please note that STAR and Hisat2 may require you to disable soft-clipping, please
refer to the SNPsplit manual for more details

Both the SNP filtering as well as the genome preparation write out little report files for record keeping.
Please note that the SNPsplit genome preparation writes out files and creates new folders for the SNPs and new genomes into
the current working directory, so move there before invoking SNPsplit_genome_preparation.


  USAGE:    SNPsplit_genome_preparation  [options] --vcf_file <file> --reference_genome /path/to/genome/ --strain <strain_name>


--vcf_file <file>             Mandatory file specifying SNP information for mouse strains from the Mouse Genomes Project
                              (https://www.mousegenomes.org/): https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-2112-v8-SNPs_Indels/
                              The file used and approved is called 'mgp_REL2021_snps.vcf.gz'. Please note that future versions
                              of this file or entirely different VCF files might not work out-of-the-box but may require some
                              tweaking. SNP calls are read from the VCF files, and high confidence SNPs are written into
                              a folder in the current working directory called SNPs_<strain_name>/chr<chromosome>.txt,
                              in the following format:

                                          SNP-ID     Chromosome  Position    Strand   Ref/SNP
                              example:   33941939        9       68878541       1       T/G

--v7_VCF                      This will use the file 'mgp_REL2005_snps_indels.vcf.gz' instead of the mgp v8 file mentioned
                              above, for backward compatibility reasons. This file contains both SNP and INDEL information,
                              but INDELs are skipped. NOTE: The v5 and v7 files work for the GRCm38 (now outdated) genome build!

--strain <strain_name>        The strain you would like to use as SNP (ALT) genome. Mandatory. For an overview of strain names
                              just run SNPsplit_genome_preparation selecting '--list_strains'.

--list_strains                Displays a list of strain names present in the VCF file for use with '--strain <strain_name>'.

--dual_hybrid                 Optional. The resulting genome will no longer relate to the original reference specified with '--reference_genome'.
                              Instead the new Reference (Ref) is defined by '--strain <strain_name>' and the new SNP genome
                              is defined by '--strain2 <strain_name>'. '--dual_hybrid' automatically sets '--full_sequence'.

                              This will invoke a multi-step process:
                                 1) Read/filter SNPs for first strain (specified with '--strain <strain_name>')
                                 2) Write full SNP incorporated (and optionally N-masked) genome sequence for first strain
                                 3) Read/filter SNPs for second strain (specified with '--strain2 <strain_name>')
                                 4) Write full SNP incorporated (and optionally N-masked) genome sequence for second strain
                                 5) Generate new Ref/Alt SNP annotations for Strain1/Strain2
                                 6) Set first strain as new reference genome and construct full SNP incorporated (and optionally
                                    N-masked) genome sequences for Strain1/Strain2


--strain2 <strain_name>       Optional for constructing dual hybrid genomes (see '--dual_hybrid' for more information). For an
                              overview of strain names just run SNPsplit_genome_preparation selecting '--list_strains'.

--reference_genome            The path to the reference genome, typically the strain 'Black6' (C57BL/6J), e.g.
                              '--reference_genome /scratch/Genomes/Mouse/GRCm39/'. Expects one or more FastA files in this folder
                              (file extension: .fa or .fasta).

--skip_filtering              This option skips reading and filtering the VCF file. This assumes that a folder named
                              'SNPs_<Strain_Name>' exists in the working directory, and that text files with SNP information
                              are contained therein in the following format:

                                          SNP-ID     Chromosome  Position    Strand   Ref/SNP
                              example:   33941939        9       68878541       1       T/G

--nmasking                    Write out a genome version for the strain specified where Ref bases are replaced with 'N'. In the
                              Ref/SNP example T/G the N-masked genome would now carry an N instead of the T. The N-masked genome
                              is written to a folder called  '<strain_name>_N-masked/'. Default: ON.

--full_sequence               Write out a genome version for the strain specified where Ref bases are replaced with the SNP base.
                              In the Ref/SNP example T/G the full sequence genome would now carry a G instead of the T. The full
                              sequence genome is written out to folder called '<strain_name>_full_sequence/'. May be set in
                              addition to '--nmasking'. Default: OFF.

--no_nmasking                 Disable N-masking if it is not desirable. Will automatically set '--full_sequence' instead.

--genome_build [name]         Name of the genome build incorporated into some of the output files. Defaults to 'GRCm39'.

--help                        Displays this help information and exits.

--version                     Displays version information and exits.


                                                             Last modified:  21 December 2022

EOF
  exit 0;
}