diff --git a/src/scripts/data/qtl_data/README.md b/src/scripts/data/qtl_data/README.md
new file mode 100644
index 0000000..8831abf
--- /dev/null
+++ b/src/scripts/data/qtl_data/README.md
@@ -0,0 +1,33 @@
+## QTL data processing
+
+The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs and ipaQTLs from the results of the eQTL Catalogue, as well as construct distance- and expression-matched negative SNPs.
+
+*Notes*:
+- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
+
+
+As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):
+1. download_finemap.py
+2. download_sumstat.py
+3. merge_finemapping_tables.py
+4. make_expression_tables.py
+
+
+To prepare the sQTL dataset, run these scripts:
+1. sqtl_make_positive_sets.py
+2. sqtl_make_negative_sets.py
+
+
+To prepare the paQTL dataset, run these scripts:
+1. paqtl_make_positive_sets.py
+2. paqtl_make_negative_sets.py
+
+
+To prepare the ipaQTL dataset, run these scripts:
+1. ipaqtl_make_positive_sets.py
+2. ipaqtl_make_negative_sets.py
+
+
+Finally, to generate the QTL VCF files, run this script:
+1. make_vcfs.py
+
diff --git a/src/scripts/data/qtl_data/download_finemap.py b/src/scripts/data/qtl_data/download_finemap.py
new file mode 100644
index 0000000..558e3ef
--- /dev/null
+++ b/src/scripts/data/qtl_data/download_finemap.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import pandas as pd
+
+import util
+
+'''
+download_finemap.py
+
+Download QTL Catalogue fine-mapping results.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ # read remote table
+ samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')
+
+ # filter GTEx (for now)
+ samples_df = samples_df[samples_df.study == 'GTEx']
+
+
+ ################################################
+ # txrevise for splicing / polyA / TSS QTLs
+
+ os.makedirs('txrev', exist_ok=True)
+ txrev_df = samples_df[samples_df.quant_method == 'txrev']
+
+ jobs = []
+ for all_ftp_path in txrev_df.ftp_path:
+ # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
+ # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz
+
+ all_ftp_file = all_ftp_path.split('/')[-1]
+ fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt')
+
+ fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/'
+ fine_ftp_path += fine_ftp_file
+
+ local_path = 'txrev/%s' % fine_ftp_file
+ if not os.path.isfile(local_path):
+ cmd = 'curl -o %s %s' % (local_path, fine_ftp_path)
+ jobs.append(cmd)
+
+ util.exec_par(jobs, 4, verbose=True)
+ # print('\n'.join(jobs))
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/download_sumstat.py b/src/scripts/data/qtl_data/download_sumstat.py
new file mode 100644
index 0000000..ca402df
--- /dev/null
+++ b/src/scripts/data/qtl_data/download_sumstat.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import pandas as pd
+
+import util
+
+'''
+download_sumstat.py
+
+Download QTL Catalogue sumstats.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ # read remote table
+ samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')
+
+ # filter GTEx (for now)
+ samples_df = samples_df[samples_df.study == 'GTEx']
+
+
+ ################################################
+ # ge for sumstat (we want SNPs and possibly also base expression)
+
+ os.makedirs('ge', exist_ok=True)
+ txrev_df = samples_df[samples_df.quant_method == 'ge']
+
+ jobs = []
+ for all_ftp_path in txrev_df.ftp_path:
+ # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
+
+ local_path = 'ge/%s' % all_ftp_path.split("/")[-1]
+
+ if not os.path.isfile(local_path):
+ cmd = 'curl -o %s %s' % (local_path, all_ftp_path)
+ jobs.append(cmd)
+
+ util.exec_par(jobs, 4, verbose=True)
+ # print('\n'.join(jobs))
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
new file mode 100644
index 0000000..3f4d49d
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ apa_file = 'polyadb_intron.bed'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+ finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile negative SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load summary stats and extract unique set of SNPs
+ vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Store intermediate SNPs
+ #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+
+ #Load polyadenylation site annotation
+ apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+ apa_df['Start'] += 1
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ apa_pr = pr.PyRanges(apa_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+ apa_gtf_df['Start'] -= max_distance
+ apa_gtf_df['End'] += max_distance
+
+ #Join vcf against polyadenylation annotation
+ apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the polyA site
+ vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]
+
+ #PolyA site position
+ vcf_apa_df['Start_b'] += max_distance
+ vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to polyA site
+ vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+
+ #Choose unique SNPs by shortest distance to polyA site
+ vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Create variant identifier
+ vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+
+ #Load merged fine-mapping dataframe
+ finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+
+ #Join against fine-mapping dataframe
+ neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+ neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+
+ #Only keep SNPs with PIP < cutoff
+ neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of negative SNPs
+ neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False)
+
+ print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py
new file mode 100644
index 0000000..f1afb7b
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_make_positive_sets.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ apa_file = 'polyadb_intron.bed'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile positive SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load fine-mapping table
+ vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ #Only keep SNPs associated with polyadenylation events
+ vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Load polyadenylation site annotation
+ apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+ apa_df['Start'] += 1
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ apa_pr = pr.PyRanges(apa_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+ apa_gtf_df['Start'] -= max_distance
+ apa_gtf_df['End'] += max_distance
+
+ #Join vcf against polyadenylation annotation
+ apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the polyA site
+ vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+
+ #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+ vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+ vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+ #PolyA site position
+ vcf_apa_df['Start_b'] += max_distance
+ vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to polyA site
+ vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+
+ #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking)
+ vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip']
+
+ vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Only keep SNPs with PIP > cutoff
+ pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of positive SNPs
+ pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t', index=False)
+
+ print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/ipaqtl_vcfs.py b/src/scripts/data/qtl_data/ipaqtl_vcfs.py
new file mode 100644
index 0000000..773c45e
--- /dev/null
+++ b/src/scripts/data/qtl_data/ipaqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+ipaqtl_vcfs.py
+
+Generate positive and negative intronic paQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options]'
+ parser = OptionParser(usage)
+ parser.add_option('--neg_pip', dest='neg_pip',
+ default=0.01, type='float',
+ help='PIP upper limit for negative examples. [Default: %default]')
+ parser.add_option('--pos_pip', dest='pos_pip',
+ default=0.9, type='float',
+ help='PIP lower limit for positive examples. [Default: %default]')
+ parser.add_option('--match_gene', dest='match_gene',
+ default=0, type='int',
+ help='Try finding negative in same gene as positive. [Default: %default]')
+ parser.add_option('--match_allele', dest='match_allele',
+ default=0, type='int',
+ help='Try finding negative with same ref and alt alleles. [Default: %default]')
+ parser.add_option('-o', dest='out_prefix',
+ default='qtlcat_ipaqtl')
+ (options,args) = parser.parse_args()
+
+ tissue_name = options.out_prefix.split('txrev_')[1]
+
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+
+ # read variant table
+ qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t')
+ qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_intronic_polya_positives.bed.gz", sep='\t')
+
+ # read TPM bin table and construct lookup dictionaries
+ tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+ gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+
+ # filter on SNPs with genes in TPM bin dict
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+
+ #Load gene span annotation (protein-coding/categorized only)
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+ gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+ # filter on SNPs with genes in GTF file
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+
+ bin_to_genes_dict = {}
+ for _, row in tpm_df.iterrows() :
+
+ if row['bin_index'] not in bin_to_genes_dict :
+ bin_to_genes_dict[row['bin_index']] = []
+
+ bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+
+ for sample_bin in bin_to_genes_dict :
+ bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+ # split molecular trait id and filter for polyadenylation (for positives)
+ qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+ qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+ qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream']
+ qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'})
+
+ qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative"
+ qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+ qtlcat_df_neg['event'] = 'downstream'
+ qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'})
+
+ paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+ # determine positive variants
+ paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip]
+ paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip]
+ pos_variants = set(paqtl_pos_df.variant)
+
+ neg_gene_and_allele_variants = 0
+ neg_gene_variants = 0
+
+ neg_expr_and_allele_variants = 0
+ neg_expr_variants = 0
+
+ unmatched_variants = 0
+
+ # choose negative variants
+ neg_variants = set()
+ neg_dict = {}
+ for pvariant in tqdm(pos_variants):
+ paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant]
+
+ neg_found = False
+
+ # optionally prefer negative from positive's gene set
+ if options.match_gene == 1 and options.match_allele == 1 :
+ pgenes = set(paqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_gene_and_allele_variants += 1
+
+ if not neg_found and options.match_gene == 1 :
+ pgenes = set(paqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_gene_variants += 1
+
+ if not neg_found and options.match_allele == 1 :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_expr_and_allele_variants += 1
+
+ if not neg_found :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_expr_variants += 1
+
+ if not neg_found :
+ print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+ unmatched_variants += 1
+
+ print('%d positive variants' % len(pos_variants))
+ print('%d negative variants' % len(neg_variants))
+ print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+ print(' - %d gene-matched negatives ' % neg_gene_variants)
+ print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+ print(' - %d expr-matched negatives ' % neg_expr_variants)
+ print(' - %d unmatched negatives ' % unmatched_variants)
+
+ pos_dict = {pv: pv for pv in pos_variants}
+
+ # write VCFs
+ write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict)
+ write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) :
+
+ gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene])
+ paqtl_neg_gene_df = paqtl_neg_df[gene_mask]
+
+ # match PAS distance
+ this_dist = paqtl_this_df.iloc[0].pas_dist
+ dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist)
+ dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+
+ this_ref = paqtl_this_df.iloc[0].ref
+ this_alt = paqtl_this_df.iloc[0].alt
+
+ for ni_unique in dist_cmp_unique:
+
+ paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+
+ shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32')
+ np.random.shuffle(shuffle_index)
+
+ for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) :
+ npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]]
+
+ if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt):
+ if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants:
+
+ neg_variants.add(npaqtl.variant)
+ neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant
+
+ return True
+
+ return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+ vcf_open = open(vcf_file, 'w')
+ print('##fileformat=VCFv4.2', file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+ print('\t'.join(cols), file=vcf_open)
+
+ variants_written = set()
+
+ for v in df.itertuples():
+ if v.variant in variants_write and v.variant not in variants_written:
+ cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+ cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])]
+ print('\t'.join(cols), file=vcf_open)
+ variants_written.add(v.variant)
+
+ vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/make_expression_tables.py b/src/scripts/data/qtl_data/make_expression_tables.py
new file mode 100644
index 0000000..ddc2a63
--- /dev/null
+++ b/src/scripts/data/qtl_data/make_expression_tables.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+import matplotlib.pyplot as plt
+
+'''
+make_expression_tables.py
+
+Contruct TPM bucket to sample genes from.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Define tissue column-to-file mapping
+ tissue_dict = {
+ 'Adipose - Subcutaneous' : 'adipose_subcutaneous',
+ 'Adipose - Visceral (Omentum)' : 'adipose_visceral',
+ 'Adrenal Gland' : 'adrenal_gland',
+ 'Artery - Aorta' : 'artery_aorta',
+ 'Artery - Coronary' : 'artery_coronary',
+ 'Artery - Tibial' : 'artery_tibial',
+ 'Whole Blood' : 'blood',
+ 'Brain - Amygdala' : 'brain_amygdala',
+ 'Brain - Anterior cingulate cortex (BA24)' : 'brain_anterior_cingulate_cortex',
+ 'Brain - Caudate (basal ganglia)' : 'brain_caudate',
+ 'Brain - Cerebellar Hemisphere' : 'brain_cerebellar_hemisphere',
+ 'Brain - Cerebellum' : 'brain_cerebellum',
+ 'Brain - Cortex' : 'brain_cortex',
+ 'Brain - Frontal Cortex (BA9)' : 'brain_frontal_cortex',
+ 'Brain - Hippocampus' : 'brain_hippocampus',
+ 'Brain - Hypothalamus' : 'brain_hypothalamus',
+ 'Brain - Nucleus accumbens (basal ganglia)' : 'brain_nucleus_accumbens',
+ 'Brain - Putamen (basal ganglia)' : 'brain_putamen',
+ 'Brain - Spinal cord (cervical c-1)' : 'brain_spinal_cord',
+ 'Brain - Substantia nigra' : 'brain_substantia_nigra',
+ 'Breast - Mammary Tissue' : 'breast',
+ 'Colon - Sigmoid' : 'colon_sigmoid',
+ 'Colon - Transverse' : 'colon_transverse',
+ 'Esophagus - Gastroesophageal Junction' : 'esophagus_gej',
+ 'Esophagus - Mucosa' : 'esophagus_mucosa',
+ 'Esophagus - Muscularis' : 'esophagus_muscularis',
+ 'Cells - Cultured fibroblasts' : 'fibroblast',
+ 'Heart - Atrial Appendage' : 'heart_atrial_appendage',
+ 'Heart - Left Ventricle' : 'heart_left_ventricle',
+ 'Kidney - Cortex' : 'kidney_cortex',
+ 'Cells - EBV-transformed lymphocytes' : 'LCL',
+ 'Liver' : 'liver',
+ 'Lung' : 'lung',
+ 'Minor Salivary Gland' : 'minor_salivary_gland',
+ 'Muscle - Skeletal' : 'muscle',
+ 'Nerve - Tibial' : 'nerve_tibial',
+ 'Ovary' : 'ovary',
+ 'Pancreas' : 'pancreas',
+ 'Pituitary' : 'pituitary',
+ 'Prostate' : 'prostate',
+ 'Skin - Not Sun Exposed (Suprapubic)' : 'skin_not_sun_exposed',
+ 'Skin - Sun Exposed (Lower leg)' : 'skin_sun_exposed',
+ 'Small Intestine - Terminal Ileum' : 'small_intestine',
+ 'Spleen' : 'spleen',
+ 'Stomach' : 'stomach',
+ 'Testis' : 'testis',
+ 'Thyroid' : 'thyroid',
+ 'Uterus' : 'uterus',
+ 'Vagina' : 'vagina',
+ }
+
+ for tissue_name in tissue_dict :
+
+ #Load TPM matrix
+ tpm_df = pd.read_csv("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", sep='\t', compression='gzip', skiprows=2)
+
+ save_name = tissue_dict[tissue_name]
+
+ print("-- " + save_name + " --")
+
+ #Clean dataframe
+ tpm_df['gene_id'] = tpm_df['Name'].apply(lambda x: x.split(".")[0])
+
+ tpm_df = tpm_df.drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ tpm_df['tpm'] = tpm_df[tissue_name]
+ tpm_df = tpm_df[['gene_id', 'tpm']]
+
+ #Get non-zero TPM entries
+ tpm_df_zero = tpm_df.loc[tpm_df['tpm'] == 0].copy().reset_index(drop=True)
+ tpm_df_nonzero = tpm_df.loc[tpm_df['tpm'] > 0].copy().reset_index(drop=True)
+
+ tpm_df_zero['tpm_log2'] = 0.
+ tpm_df_nonzero['tpm_log2'] = np.log2(tpm_df_nonzero['tpm'])
+
+ #Clip at extremes
+ min_q = 0.0075
+ max_q = 0.9925
+
+ #Log2 fold change bin sizes
+ bin_size = 0.4
+ bin_offset = 0.15
+
+ min_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=min_q)
+ max_tpm_log2 = np.quantile(tpm_df_nonzero['tpm_log2'], q=max_q)
+
+ tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] < min_tpm_log2, 'tpm_log2'] = min_tpm_log2
+ tpm_df_nonzero.loc[tpm_df_nonzero['tpm_log2'] > max_tpm_log2, 'tpm_log2'] = max_tpm_log2
+
+ tpm_log2 = tpm_df_nonzero['tpm_log2'].values
+
+ n_bins = int((max_tpm_log2 - min_tpm_log2) / bin_size)
+
+ #Get sample bins
+ sample_bins = np.linspace(min_tpm_log2, max_tpm_log2, n_bins+1)
+
+ #Map values to bins
+ bin_index = np.digitize(tpm_log2, sample_bins[1:], right=True)
+ bin_index_l = np.digitize(tpm_log2 - bin_offset, sample_bins[1:], right=True)
+ bin_index_r = np.digitize(tpm_log2 + bin_offset, sample_bins[1:], right=True)
+
+ tpm_df_zero['bin_index_l'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+ tpm_df_zero['bin_index'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+ tpm_df_zero['bin_index_r'] = -1 * np.ones(len(tpm_df_zero), dtype='int32')
+
+ tpm_df_nonzero['bin_index_l'] = bin_index_l
+ tpm_df_nonzero['bin_index'] = bin_index
+ tpm_df_nonzero['bin_index_r'] = bin_index_r
+
+ tpm_df = pd.concat([tpm_df_zero, tpm_df_nonzero]).copy().reset_index(drop=True)
+
+ tpm_df = tpm_df.sort_values(by='gene_id', ascending=True).copy().reset_index(drop=True)
+
+ #Save dataframe
+ tpm_df.to_csv('ge/GTEx_ge_' + save_name + "_tpms.csv", sep='\t', index=False)
+
+ #Visualize TPM sample bins
+ tpm_df_filtered = tpm_df.loc[tpm_df['tpm'] > 0.]
+
+ f = plt.figure(figsize=(4, 3))
+
+ plt.hist(tpm_df_filtered['bin_index'].values, bins=np.unique(tpm_df_filtered['bin_index'].values))
+
+ plt.xlim(0, np.max(tpm_df_filtered['bin_index'].values))
+
+ plt.xticks(fontsize=8)
+ plt.yticks(fontsize=8)
+
+ plt.xlabel("Sample bin (FC < " + str(round(2**(bin_size+2*bin_offset), 2)) + ")", fontsize=8)
+ plt.ylabel("# of genes", fontsize=8)
+
+ plt.title("TPM sample bins (" + save_name + ")", fontsize=8)
+
+ plt.tight_layout()
+
+ plt.savefig('ge/GTEx_ge_' + save_name + "_tpms.png", transparent=False, dpi=300)
+
+ plt.close()
+
+ #Check and warn in case of low-support bins
+ _, bin_support = np.unique(tpm_df_filtered['bin_index'].values, return_counts=True)
+
+ if np.any(bin_support < 100) :
+ print("[Warning] Less than 100 genes in some of the TPM sample bins (min = " + str(int(np.min(bin_support))) + ").")
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/make_vcfs.py b/src/scripts/data/qtl_data/make_vcfs.py
new file mode 100644
index 0000000..aa251d0
--- /dev/null
+++ b/src/scripts/data/qtl_data/make_vcfs.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import glob
+import os
+
+import pandas as pd
+
+import util
+
+'''
+make_vcfs.py
+
+Download QTL Catalogue fine-mapping results.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ pip = 0.2
+ match_gene = 0
+ match_allele = 1
+
+ ################################################
+ # intronic polyA QTLs
+
+ out_dir = 'ipaqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+ os.makedirs(out_dir, exist_ok=True)
+
+ jobs = []
+ for table_file in glob.glob('txrev/*.txt.gz'):
+ out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+ out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+ cmd = './ipaqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+ jobs.append(cmd)
+ util.exec_par(jobs, 6, verbose=True)
+
+ # merge study/tissue variants
+ mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+ mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+ merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+ merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+
+
+ ################################################
+ # polyA QTLs
+
+ out_dir = 'paqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+ os.makedirs(out_dir, exist_ok=True)
+
+ jobs = []
+ for table_file in glob.glob('txrev/*.txt.gz'):
+ out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+ out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+ cmd = './paqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+ jobs.append(cmd)
+ util.exec_par(jobs, 6, verbose=True)
+
+ # merge study/tissue variants
+ mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+ mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+ merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+ merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+
+ ################################################
+ # splicing QTLs
+
+ out_dir = 'sqtl_pip%d%s%s' % (pip*100, 'g' if match_gene == 1 else 'e', 'a' if match_allele else '')
+ os.makedirs(out_dir, exist_ok=True)
+
+ jobs = []
+ for table_file in glob.glob('txrev/*.txt.gz'):
+ out_prefix = table_file.replace('txrev/', '%s/' % out_dir)
+ out_prefix = out_prefix.replace('.purity_filtered.txt.gz', '')
+ cmd = './sqtl_vcfs.py --neg_pip 0.01 --pos_pip %f --match_gene %d --match_allele %d -o %s' % (pip, match_gene, match_allele, out_prefix)
+ jobs.append(cmd)
+ util.exec_par(jobs, 6, verbose=True)
+
+ # merge study/tissue variants
+ mpos_vcf_file = '%s/pos_merge.vcf' % out_dir
+ mneg_vcf_file = '%s/neg_merge.vcf' % out_dir
+ merge_variants(mpos_vcf_file, '%s/*_pos.vcf' % out_dir)
+ merge_variants(mneg_vcf_file, '%s/*_neg.vcf' % out_dir)
+
+
+def merge_variants(merge_vcf_file, vcf_glob):
+ with open(merge_vcf_file, 'w') as merge_vcf_open:
+ vcf0_file = list(glob.glob(vcf_glob))[0]
+ for line in open(vcf0_file):
+ if line[0] == '#':
+ print(line, end='', file=merge_vcf_open)
+
+ merged_variants = set()
+ for vcf_file in glob.glob(vcf_glob):
+ for line in open(vcf_file):
+ if not line.startswith('#'):
+ variant = line.split()[2]
+ if variant not in merged_variants:
+ print(line, file=merge_vcf_open, end='')
+ merged_variants.add(variant)
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/merge_finemapping_tables.py b/src/scripts/data/qtl_data/merge_finemapping_tables.py
new file mode 100644
index 0000000..ac4fa7d
--- /dev/null
+++ b/src/scripts/data/qtl_data/merge_finemapping_tables.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+'''
+merge_finemapping_tables.py
+
+Merge fine-mapping tables of QTL credible sets.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Load and merge fine-mapping results
+ dfs = []
+ for tissue_name in tissue_names :
+
+ print("-- " + tissue_name + " --")
+
+ df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip'], low_memory=False)
+ dfs.append(df.sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True))
+
+ df = pd.concat(dfs).sort_values(by='pip', ascending=False).drop_duplicates(subset=['variant'], keep='first').copy().reset_index(drop=True)
+
+ df['chromosome'] = "chr" + df['chromosome'].astype(str)
+ df = df.rename(columns={'chromosome' : 'chrom', 'position' : 'pos'})
+
+ print("len(df) = " + str(len(df)))
+
+ #Save union of dataframes
+ df.to_csv("txrev/GTEx_txrev_finemapped_merged.csv.gz", sep='\t', index=False)
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/paqtl_make_negative_sets.py b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py
new file mode 100644
index 0000000..a5da60d
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_make_negative_sets.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+ finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile negative SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load summary stats and extract unique set of SNPs
+ vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Store intermediate SNPs
+ #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+
+ #Load polyadenylation site annotation
+ apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+ apa_df['Start'] += 1
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ apa_pr = pr.PyRanges(apa_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+ apa_gtf_df['Start'] -= max_distance
+ apa_gtf_df['End'] += max_distance
+
+ #Join vcf against polyadenylation annotation
+ apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the polyA site
+ vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]
+
+ #PolyA site position
+ vcf_apa_df['Start_b'] += max_distance
+ vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to polyA site
+ vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+
+ #Choose unique SNPs by shortest distance to polyA site
+ vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Create variant identifier
+ vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+
+ #Load merged fine-mapping dataframe
+ finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+
+ #Join against fine-mapping dataframe
+ neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+ neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+
+ #Only keep SNPs with PIP < cutoff
+ neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of negative SNPs
+ neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t', index=False)
+
+ print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/paqtl_make_positive_sets.py b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py
new file mode 100644
index 0000000..3d07fa3
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_make_positive_sets.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+paqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for paQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ apa_file = '/home/drk/common/data/genomes/hg38/genes/polyadb/polyadb_exon3.bed'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile positive SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load fine-mapping table
+ vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ #Only keep SNPs associated with polyadenylation events
+ vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".downstream.")].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Load polyadenylation site annotation
+ apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
+ apa_df['Start'] += 1
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ apa_pr = pr.PyRanges(apa_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+ apa_gtf_df['Start'] -= max_distance
+ apa_gtf_df['End'] += max_distance
+
+ #Join vcf against polyadenylation annotation
+ apa_gtf_pr = pr.PyRanges(apa_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the polyA site
+ vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+
+ #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+ vcf_apa_df['molecular_trait_gene_id'] = vcf_apa_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+ vcf_apa_df = vcf_apa_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+ #PolyA site position
+ vcf_apa_df['Start_b'] += max_distance
+ vcf_apa_df = vcf_apa_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'pas_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to polyA site
+ vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])
+
+ #Choose unique SNPs by shortest distance to polyA site (and inverse PIP for tie-breaking)
+ vcf_apa_df['pip_inv'] = 1. - vcf_apa_df['pip']
+
+ vcf_apa_df = vcf_apa_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Only keep SNPs with PIP > cutoff
+ pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of positive SNPs
+ pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t', index=False)
+
+ print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/paqtl_vcfs.py b/src/scripts/data/qtl_data/paqtl_vcfs.py
new file mode 100644
index 0000000..f0884b1
--- /dev/null
+++ b/src/scripts/data/qtl_data/paqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+paqtl_vcfs.py
+
+Generate positive and negative paQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options]'
+ parser = OptionParser(usage)
+ parser.add_option('--neg_pip', dest='neg_pip',
+ default=0.01, type='float',
+ help='PIP upper limit for negative examples. [Default: %default]')
+ parser.add_option('--pos_pip', dest='pos_pip',
+ default=0.9, type='float',
+ help='PIP lower limit for positive examples. [Default: %default]')
+ parser.add_option('--match_gene', dest='match_gene',
+ default=0, type='int',
+ help='Try finding negative in same gene as positive. [Default: %default]')
+ parser.add_option('--match_allele', dest='match_allele',
+ default=0, type='int',
+ help='Try finding negative with same ref and alt alleles. [Default: %default]')
+ parser.add_option('-o', dest='out_prefix',
+ default='qtlcat_paqtl')
+ (options,args) = parser.parse_args()
+
+ tissue_name = options.out_prefix.split('txrev_')[1]
+
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+
+ # read variant table
+ qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_polya_negatives.bed.gz", sep='\t')
+ qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_polya_positives.bed.gz", sep='\t')
+
+ # read TPM bin table and construct lookup dictionaries
+ tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+ gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+
+ # filter on SNPs with genes in TPM bin dict
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+
+ #Load gene span annotation (protein-coding/categorized only)
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+ gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+ # filter on SNPs with genes in GTF file
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+
+ bin_to_genes_dict = {}
+ for _, row in tpm_df.iterrows() :
+
+ if row['bin_index'] not in bin_to_genes_dict :
+ bin_to_genes_dict[row['bin_index']] = []
+
+ bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+
+ for sample_bin in bin_to_genes_dict :
+ bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+ # split molecular trait id and filter for polyadenylation (for positives)
+ qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+ qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+ qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'downstream']
+ qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'pas_dist'})
+
+ qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.downstream.negative"
+ qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+ qtlcat_df_neg['event'] = 'downstream'
+ qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'pas_dist'})
+
+ paqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+ # determine positive variants
+ paqtl_pos_df = paqtl_df[paqtl_df.pip >= options.pos_pip]
+ paqtl_neg_df = paqtl_df[paqtl_df.pip < options.neg_pip]
+ pos_variants = set(paqtl_pos_df.variant)
+
+ neg_gene_and_allele_variants = 0
+ neg_gene_variants = 0
+
+ neg_expr_and_allele_variants = 0
+ neg_expr_variants = 0
+
+ unmatched_variants = 0
+
+ # choose negative variants
+ neg_variants = set()
+ neg_dict = {}
+ for pvariant in tqdm(pos_variants):
+ paqtl_this_df = paqtl_pos_df[paqtl_pos_df.variant == pvariant]
+
+ neg_found = False
+
+ # optionally prefer negative from positive's gene set
+ if options.match_gene == 1 and options.match_allele == 1 :
+ pgenes = set(paqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_gene_and_allele_variants += 1
+
+ if not neg_found and options.match_gene == 1 :
+ pgenes = set(paqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_gene_variants += 1
+
+ if not neg_found and options.match_allele == 1 :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_expr_and_allele_variants += 1
+
+ if not neg_found :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[paqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_expr_variants += 1
+
+ if not neg_found :
+ print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+ unmatched_variants += 1
+
+ print('%d positive variants' % len(pos_variants))
+ print('%d negative variants' % len(neg_variants))
+ print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+ print(' - %d gene-matched negatives ' % neg_gene_variants)
+ print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+ print(' - %d expr-matched negatives ' % neg_expr_variants)
+ print(' - %d unmatched negatives ' % unmatched_variants)
+
+ pos_dict = {pv: pv for pv in pos_variants}
+
+ # write VCFs
+ write_vcf('%s_pos.vcf' % options.out_prefix, paqtl_df, pos_variants, pos_dict)
+ write_vcf('%s_neg.vcf' % options.out_prefix, paqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, paqtl_this_df, paqtl_neg_df, pgenes, match_allele) :
+
+ gene_mask = np.array([gene in pgenes for gene in paqtl_neg_df.gene])
+ paqtl_neg_gene_df = paqtl_neg_df[gene_mask]
+
+ # match PAS distance
+ this_dist = paqtl_this_df.iloc[0].pas_dist
+ dist_cmp = np.abs(paqtl_neg_gene_df.pas_dist - this_dist)
+ dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+
+ this_ref = paqtl_this_df.iloc[0].ref
+ this_alt = paqtl_this_df.iloc[0].alt
+
+ for ni_unique in dist_cmp_unique:
+
+ paqtl_neg_gene_dist_df = paqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+
+ shuffle_index = np.arange(len(paqtl_neg_gene_dist_df), dtype='int32')
+ np.random.shuffle(shuffle_index)
+
+ for npaqtl_i in range(len(paqtl_neg_gene_dist_df)) :
+ npaqtl = paqtl_neg_gene_dist_df.iloc[shuffle_index[npaqtl_i]]
+
+ if not match_allele or (npaqtl.ref == this_ref and npaqtl.alt == this_alt):
+ if npaqtl.variant not in neg_variants and npaqtl.variant not in pos_variants:
+
+ neg_variants.add(npaqtl.variant)
+ neg_dict[npaqtl.variant] = paqtl_this_df.iloc[0].variant
+
+ return True
+
+ return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+ vcf_open = open(vcf_file, 'w')
+ print('##fileformat=VCFv4.2', file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+ print('\t'.join(cols), file=vcf_open)
+
+ variants_written = set()
+
+ for v in df.itertuples():
+ if v.variant in variants_write and v.variant not in variants_written:
+ cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+ cols += ['MT=%s;PD=%d;PI=%s' % (v.molecular_trait_id, v.pas_dist, variants_dict[v.variant])]
+ print('\t'.join(cols), file=vcf_open)
+ variants_written.add(v.variant)
+
+ vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/sqtl_make_negative_sets.py b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py
new file mode 100644
index 0000000..7518ca4
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_make_negative_sets.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+sqtl_make_negative_sets.py
+
+Build tables with negative (non-causal) SNPs for sQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+ finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile negative SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load summary stats and extract unique set of SNPs
+ vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Store intermediate SNPs
+ #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)
+
+ #Load splice site annotation
+ splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']]
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ splice_pr = pr.PyRanges(splice_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)
+
+ splice_gtf_df['Start'] -= max_distance
+ splice_gtf_df['End'] += max_distance
+
+ #Join vcf against splice annotation
+ splice_gtf_pr = pr.PyRanges(splice_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the splice site
+ vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b']]
+
+ #Splice site position
+ vcf_splice_df['Start_b'] += max_distance
+ vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to splice site
+ vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos'])
+
+ #Choose unique SNPs by shortest distance to splice site
+ vcf_splice_df = vcf_splice_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Create variant identifier
+ vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']
+
+ #Load merged fine-mapping dataframe
+ finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]
+
+ #Join against fine-mapping dataframe
+ neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
+ neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.
+
+ #Only keep SNPs with PIP < cutoff
+ neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of negative SNPs
+ neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t', index=False)
+
+ print("len(neg_df) = " + str(len(neg_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/sqtl_make_positive_sets.py b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py
new file mode 100644
index 0000000..954ab7e
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_make_positive_sets.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+
+import os
+
+import util
+
+import numpy as np
+import pandas as pd
+
+import pyranges as pr
+
+'''
+sqtl_make_positive_sets.py
+
+Build tables with positive (causal) SNPs for sQTLs.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options] arg'
+ parser = OptionParser(usage)
+ #parser.add_option()
+ (options,args) = parser.parse_args()
+
+ #Parameters
+ pip_cutoff = 0.01
+ max_distance = 10000
+ gene_pad = 50
+ splice_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_protein_splice.gff'
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
+
+ #Define tissues
+ tissue_names = [
+ 'adipose_subcutaneous',
+ 'adipose_visceral',
+ 'adrenal_gland',
+ 'artery_aorta',
+ 'artery_coronary',
+ 'artery_tibial',
+ 'blood',
+ 'brain_amygdala',
+ 'brain_anterior_cingulate_cortex',
+ 'brain_caudate',
+ 'brain_cerebellar_hemisphere',
+ 'brain_cerebellum',
+ 'brain_cortex',
+ 'brain_frontal_cortex',
+ 'brain_hippocampus',
+ 'brain_hypothalamus',
+ 'brain_nucleus_accumbens',
+ 'brain_putamen',
+ 'brain_spinal_cord',
+ 'brain_substantia_nigra',
+ 'breast',
+ 'colon_sigmoid',
+ 'colon_transverse',
+ 'esophagus_gej',
+ 'esophagus_mucosa',
+ 'esophagus_muscularis',
+ 'fibroblast',
+ 'heart_atrial_appendage',
+ 'heart_left_ventricle',
+ 'kidney_cortex',
+ 'LCL',
+ 'liver',
+ 'lung',
+ 'minor_salivary_gland',
+ 'muscle',
+ 'nerve_tibial',
+ 'ovary',
+ 'pancreas',
+ 'pituitary',
+ 'prostate',
+ 'skin_not_sun_exposed',
+ 'skin_sun_exposed',
+ 'small_intestine',
+ 'spleen',
+ 'stomach',
+ 'testis',
+ 'thyroid',
+ 'uterus',
+ 'vagina',
+ ]
+
+ #Compile positive SNP set for each tissue
+ for tissue_name in tissue_names :
+
+ print("-- " + str(tissue_name) + " --")
+
+ #Load fine-mapping table
+ vcf_df = pd.read_csv("txrev/GTEx_txrev_" + tissue_name + ".purity_filtered.txt.gz", sep='\t', usecols=['chromosome', 'position', 'ref', 'alt', 'variant', 'pip', 'molecular_trait_id'], low_memory=False)
+
+ #Only keep SNPs (no indels)
+ vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)
+
+ #Only keep SNPs associated with splice events
+ vcf_df = vcf_df.loc[vcf_df['molecular_trait_id'].str.contains(".contained.")].copy().reset_index(drop=True)
+
+ vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
+ vcf_df['start'] = vcf_df['position'].astype(int)
+ vcf_df['end'] = vcf_df['start'] + 1
+ vcf_df['strand'] = "."
+
+ vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand', 'variant', 'pip', 'molecular_trait_id']]
+ vcf_df = vcf_df.rename(columns={'chromosome' : 'Chromosome', 'start' : 'Start', 'end' : 'End', 'strand' : 'Strand'})
+
+ print("len(vcf_df) = " + str(len(vcf_df)))
+
+ #Load splice site annotation
+ splice_df = pd.read_csv(splice_file, sep='\t', names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'], usecols=['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand'])[['Chromosome', 'Start', 'End', 'feature', 'feat1', 'Strand']]
+
+ #Load gene span annotation
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
+ gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)
+
+ gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])
+
+ gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)
+
+ gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
+ gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad
+
+ #Join dataframes against gtf annotation
+ splice_pr = pr.PyRanges(splice_df)
+ gtf_pr = pr.PyRanges(gtf_df)
+ vcf_pr = pr.PyRanges(vcf_df)
+
+ splice_gtf_pr = splice_pr.join(gtf_pr, strandedness='same')
+ vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)
+
+ splice_gtf_df = splice_gtf_pr.df[['Chromosome', 'Start', 'End', 'feature', 'gene_id', 'Strand']].copy().reset_index(drop=True)
+ vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id', 'variant', 'pip', 'molecular_trait_id']].copy().reset_index(drop=True)
+
+ splice_gtf_df['Start'] -= max_distance
+ splice_gtf_df['End'] += max_distance
+
+ #Join vcf against splice annotation
+ splice_gtf_pr = pr.PyRanges(splice_gtf_df)
+ vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)
+
+ vcf_splice_pr = vcf_gtf_pr.join(splice_gtf_pr, strandedness=False)
+
+ #Force gene_id of SNP to be same as the gene_id of the splice site
+ vcf_splice_df = vcf_splice_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
+ vcf_splice_df = vcf_splice_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'feature', 'Strand_b', 'Start_b', 'variant', 'pip', 'molecular_trait_id']]
+
+ #Force gene_id of SNP to be same as the gene_id of the finemapped molecular trait
+ vcf_splice_df['molecular_trait_gene_id'] = vcf_splice_df['molecular_trait_id'].apply(lambda x: x.split(".")[0])
+ vcf_splice_df = vcf_splice_df.query("gene_id == molecular_trait_gene_id").copy().reset_index(drop=True)
+
+ #Splice site position
+ vcf_splice_df['Start_b'] += max_distance
+ vcf_splice_df = vcf_splice_df.rename(columns={'Start' : 'Pos', 'Start_b' : 'splice_pos', 'Strand_b' : 'Strand'})
+
+ #Distance to splice site
+ vcf_splice_df['distance'] = np.abs(vcf_splice_df['Pos'] - vcf_splice_df['splice_pos'])
+
+ #Choose unique SNPs by shortest distance to splice site (and inverse PIP for tie-breaking)
+ vcf_splice_df['pip_inv'] = 1. - vcf_splice_df['pip']
+
+ vcf_splice_df = vcf_splice_df.sort_values(by=['distance', 'pip_inv'], ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
+ vcf_splice_df = vcf_splice_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)
+
+ vcf_df_filtered = vcf_splice_df.rename(columns={'Chromosome' : 'chrom', 'Pos' : 'pos', 'Strand' : 'strand'})
+ vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'feature', 'strand', 'splice_pos', 'distance', 'variant', 'pip', 'molecular_trait_id']]
+
+ print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))
+
+ #Store intermediate SNPs (filtered)
+ vcf_df_filtered.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', index=False)
+
+ #Reload filtered SNP file
+ vcf_df_filtered = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_finemapped_filtered.bed.gz", sep='\t', compression='gzip')
+
+ #Only keep SNPs with PIP > cutoff
+ pos_df = vcf_df_filtered.query("pip > " + str(pip_cutoff)).copy().reset_index(drop=True)
+
+ #Store final table of positive SNPs
+ pos_df.to_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t', index=False)
+
+ print("len(pos_df) = " + str(len(pos_df)))
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/qtl_data/sqtl_vcfs.py b/src/scripts/data/qtl_data/sqtl_vcfs.py
new file mode 100644
index 0000000..d275a76
--- /dev/null
+++ b/src/scripts/data/qtl_data/sqtl_vcfs.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+from optparse import OptionParser
+import os
+import pdb
+import time
+
+import numpy as np
+import pandas as pd
+import pyranges as pr
+from tqdm import tqdm
+
+'''
+sqtl_vcfs.py
+
+Generate positive and negative sQTL sets from the QTL catalog txrevise.
+'''
+
+################################################################################
+# main
+################################################################################
+def main():
+ usage = 'usage: %prog [options]'
+ parser = OptionParser(usage)
+ parser.add_option('--neg_pip', dest='neg_pip',
+ default=0.01, type='float',
+ help='PIP upper limit for negative examples. [Default: %default]')
+ parser.add_option('--pos_pip', dest='pos_pip',
+ default=0.9, type='float',
+ help='PIP lower limit for positive examples. [Default: %default]')
+ parser.add_option('--match_gene', dest='match_gene',
+ default=0, type='int',
+ help='Try finding negative in same gene as positive. [Default: %default]')
+ parser.add_option('--match_allele', dest='match_allele',
+ default=0, type='int',
+ help='Try finding negative with same ref and alt alleles. [Default: %default]')
+ parser.add_option('-o', dest='out_prefix',
+ default='qtlcat_sqtl')
+ (options,args) = parser.parse_args()
+
+ tissue_name = options.out_prefix.split('txrev_')[1]
+
+ gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort_protein.gtf'
+
+ # read variant table
+ qtlcat_df_neg = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_splice_negatives.bed.gz", sep='\t')
+ qtlcat_df_pos = pd.read_csv("txrev/GTEx_snps_" + tissue_name + "_splice_positives.bed.gz", sep='\t')
+
+ # read TPM bin table and construct lookup dictionaries
+ tpm_df = pd.read_csv('ge/GTEx_ge_' + tissue_name + "_tpms.csv", sep='\t')[['gene_id', 'tpm', 'bin_index', 'bin_index_l', 'bin_index_r']]
+ gene_to_tpm_dict = tpm_df.set_index('gene_id').to_dict(orient='index')
+
+ # filter on SNPs with genes in TPM bin dict
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(tpm_df['gene_id'].values.tolist())].copy().reset_index(drop=True)
+
+ #Load gene span annotation (protein-coding/categorized only)
+ gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['id_str'])
+ gtf_genes = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0]).unique().tolist()
+
+ # filter on SNPs with genes in GTF file
+ qtlcat_df_neg = qtlcat_df_neg.loc[qtlcat_df_neg['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+ qtlcat_df_pos = qtlcat_df_pos.loc[qtlcat_df_pos['gene_id'].isin(gtf_genes)].copy().reset_index(drop=True)
+
+ bin_to_genes_dict = {}
+ for _, row in tpm_df.iterrows() :
+
+ if row['bin_index'] not in bin_to_genes_dict :
+ bin_to_genes_dict[row['bin_index']] = []
+
+ bin_to_genes_dict[row['bin_index']].append(row['gene_id'])
+
+ for sample_bin in bin_to_genes_dict :
+ bin_to_genes_dict[sample_bin] = set(bin_to_genes_dict[sample_bin])
+
+ # split molecular trait id and filter for polyadenylation (for positives)
+ qtlcat_df_pos['gene'] = [mti.split('.')[0] for mti in qtlcat_df_pos.molecular_trait_id]
+ qtlcat_df_pos['event'] = [mti.split('.')[2] for mti in qtlcat_df_pos.molecular_trait_id]
+
+ qtlcat_df_pos = qtlcat_df_pos[qtlcat_df_pos.event == 'contained']
+ qtlcat_df_pos = qtlcat_df_pos.rename(columns={'distance' : 'splice_dist'})
+
+ qtlcat_df_neg['molecular_trait_id'] = qtlcat_df_neg['gene_id'] + "." + "grp_0.contained.negative"
+ qtlcat_df_neg['gene'] = qtlcat_df_neg['gene_id']
+ qtlcat_df_neg['event'] = 'contained'
+ qtlcat_df_neg = qtlcat_df_neg.rename(columns={'distance' : 'splice_dist'})
+
+ sqtl_df = pd.concat([qtlcat_df_neg, qtlcat_df_pos]).copy().reset_index(drop=True)
+
+ # determine positive variants
+ sqtl_pos_df = sqtl_df[sqtl_df.pip >= options.pos_pip]
+ sqtl_neg_df = sqtl_df[sqtl_df.pip < options.neg_pip]
+ pos_variants = set(sqtl_pos_df.variant)
+
+ neg_gene_and_allele_variants = 0
+ neg_gene_variants = 0
+
+ neg_expr_and_allele_variants = 0
+ neg_expr_variants = 0
+
+ unmatched_variants = 0
+
+ # choose negative variants
+ neg_variants = set()
+ neg_dict = {}
+ for pvariant in tqdm(pos_variants):
+ sqtl_this_df = sqtl_pos_df[sqtl_pos_df.variant == pvariant]
+
+ neg_found = False
+
+ # optionally prefer negative from positive's gene set
+ if options.match_gene == 1 and options.match_allele == 1 :
+ pgenes = set(sqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_gene_and_allele_variants += 1
+
+ if not neg_found and options.match_gene == 1 :
+ pgenes = set(sqtl_this_df.gene)
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_gene_variants += 1
+
+ if not neg_found and options.match_allele == 1 :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+
+ if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, True)
+
+ if neg_found :
+ neg_expr_and_allele_variants += 1
+
+ if not neg_found :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_l']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+
+ if not neg_found and gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index'] != gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r'] :
+ pgenes = bin_to_genes_dict[gene_to_tpm_dict[sqtl_this_df.iloc[0].gene]['bin_index_r']]
+ neg_found = find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, False)
+
+ if neg_found :
+ neg_expr_variants += 1
+
+ if not neg_found :
+ print("[Warning] Could not find a matching negative for '" + pvariant + "'")
+ unmatched_variants += 1
+
+ print('%d positive variants' % len(pos_variants))
+ print('%d negative variants' % len(neg_variants))
+ print(' - %d gene-matched negatives with same alleles' % neg_gene_and_allele_variants)
+ print(' - %d gene-matched negatives ' % neg_gene_variants)
+ print(' - %d expr-matched negatives with same alleles' % neg_expr_and_allele_variants)
+ print(' - %d expr-matched negatives ' % neg_expr_variants)
+ print(' - %d unmatched negatives ' % unmatched_variants)
+
+ pos_dict = {pv: pv for pv in pos_variants}
+
+ # write VCFs
+ write_vcf('%s_pos.vcf' % options.out_prefix, sqtl_df, pos_variants, pos_dict)
+ write_vcf('%s_neg.vcf' % options.out_prefix, sqtl_df, neg_variants, neg_dict)
+
+def find_negative(neg_variants, neg_dict, pos_variants, sqtl_this_df, sqtl_neg_df, pgenes, match_allele) :
+
+ gene_mask = np.array([gene in pgenes for gene in sqtl_neg_df.gene])
+ sqtl_neg_gene_df = sqtl_neg_df[gene_mask]
+
+ # match PAS distance
+ this_dist = sqtl_this_df.iloc[0].splice_dist
+ dist_cmp = np.abs(sqtl_neg_gene_df.splice_dist - this_dist)
+ dist_cmp_unique = np.sort(np.unique(dist_cmp.values))
+
+ this_ref = sqtl_this_df.iloc[0].ref
+ this_alt = sqtl_this_df.iloc[0].alt
+
+ for ni_unique in dist_cmp_unique:
+
+ sqtl_neg_gene_dist_df = sqtl_neg_gene_df.loc[dist_cmp == ni_unique]
+
+ shuffle_index = np.arange(len(sqtl_neg_gene_dist_df), dtype='int32')
+ np.random.shuffle(shuffle_index)
+
+ for nsqtl_i in range(len(sqtl_neg_gene_dist_df)) :
+ nsqtl = sqtl_neg_gene_dist_df.iloc[shuffle_index[nsqtl_i]]
+
+ if not match_allele or (nsqtl.ref == this_ref and nsqtl.alt == this_alt):
+ if nsqtl.variant not in neg_variants and nsqtl.variant not in pos_variants:
+
+ neg_variants.add(nsqtl.variant)
+ neg_dict[nsqtl.variant] = sqtl_this_df.iloc[0].variant
+
+ return True
+
+ return False
+
+def write_vcf(vcf_file, df, variants_write, variants_dict):
+ vcf_open = open(vcf_file, 'w')
+ print('##fileformat=VCFv4.2', file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ print('##INFO=',
+ file=vcf_open)
+ cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+ print('\t'.join(cols), file=vcf_open)
+
+ variants_written = set()
+
+ for v in df.itertuples():
+ if v.variant in variants_write and v.variant not in variants_written:
+ cols = [v.chrom, str(v.pos), v.variant, v.ref, v.alt, '.', '.']
+ cols += ['MT=%s;SD=%d;PI=%s' % (v.molecular_trait_id, v.splice_dist, variants_dict[v.variant])]
+ print('\t'.join(cols), file=vcf_open)
+ variants_written.add(v.variant)
+
+ vcf_open.close()
+
+
+################################################################################
+# __main__
+################################################################################
+if __name__ == '__main__':
+ main()
diff --git a/src/scripts/data/training_data/Makefile b/src/scripts/data/training_data/Makefile
new file mode 100644
index 0000000..170222b
--- /dev/null
+++ b/src/scripts/data/training_data/Makefile
@@ -0,0 +1,47 @@
+FASTA_HUMAN=$$HG38/assembly/ucsc/hg38.ml.fa
+GAPS_HUMAN=$$HG38/assembly/ucsc/hg38_gaps.bed
+UMAP_HUMAN=$$HG38//mappability/umap_k36_t10_l32.bed
+BLACK_HUMAN=$$HG38/blacklist/blacklist_hg38_all.bed
+
+FASTA_MOUSE=$$MM10/assembly/ucsc/mm10.ml.fa
+GAPS_MOUSE=$$MM10/assembly/ucsc/mm10_gaps.bed
+UMAP_MOUSE=$$MM10//mappability/umap_k36_t10_l32.bed
+BLACK_MOUSE=$$MM10/blacklist/blacklist_mm10_all.bed
+
+ALIGN=$$HG38/align/hg38.mm10.syn.net.gz
+
+OUT=/scratch3/drk/seqnn/data/v9
+
+# LENGTH=393216
+# TSTRIDE=43691 # (393216-2*131072)/3
+# CROP=131072
+
+LENGTH=524288
+TSTRIDE=49173 # (524288-2*163840)/4 + 21
+CROP=163840
+WIDTH=32
+FOLDS=8
+
+AOPTS=--break 2097152 -c $(CROP) --nf 524288 --no 393216 -l $(LENGTH) --stride $(TSTRIDE) -f $(FOLDS) --umap_t 0.5 -w $(WIDTH)
+DOPTS=-c $(CROP) -d 2 -f $(FOLDS) -l $(LENGTH) -p 64 -r 16 --umap_clip 0.5 -w $(WIDTH)
+
+
+all: $(OUT)/hg38/tfrecords/train-0.tfr $(OUT)/mm10/tfrecords/train-0.tfr
+
+umap_human.bed:
+ cat $(UMAP_HUMAN) $(BLACK_HUMAN) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_human.bed
+
+umap_mouse.bed:
+ cat $(UMAP_MOUSE) $(BLACK_MOUSE) | awk 'BEGIN {OFS="\t"} {print $$1, $$2, $$3}' | bedtools sort -i - | bedtools merge -i - > umap_mouse.bed
+
+targets_human.txt targets_mouse.txt:
+ ./make_targets.py
+
+$(OUT)/hg38/sequences.bed $(OUT)/mm10/sequences.bed: umap_human.bed umap_mouse.bed
+ basenji_data_align.py -a hg38,mm10 -g $(GAPS_HUMAN),$(GAPS_MOUSE) -u umap_human.bed,umap_mouse.bed $(AOPTS) -o $(OUT) $(ALIGN) $(FASTA_HUMAN),$(FASTA_MOUSE)
+
+$(OUT)/hg38/tfrecords/train-0.tfr: $(OUT)/hg38/sequences.bed targets_human.txt
+ basenji_data.py --restart $(DOPTS) -b $(BLACK_HUMAN) -o $(OUT)/hg38 $(FASTA_HUMAN) -u umap_human.bed targets_human.txt
+
+$(OUT)/mm10/tfrecords/train-0.tfr: $(OUT)/mm10/sequences.bed targets_mouse.txt
+ basenji_data.py --restart $(DOPTS) -b $(BLACK_MOUSE) -o $(OUT)/mm10 $(FASTA_MOUSE) -u umap_mouse.bed targets_mouse.txt
diff --git a/src/scripts/data/training_data/README.md b/src/scripts/data/training_data/README.md
new file mode 100644
index 0000000..7c2751e
--- /dev/null
+++ b/src/scripts/data/training_data/README.md
@@ -0,0 +1,11 @@
+## Data processing & Training
+
+Processing of ENCODE, GTEx, FANTOM5, and CATlas training data is done through a Makefile. It requires a number of auxiliary files (e.g. genome alignments), which can be downloaded from the Borzoi training data bucket [here](https://storage.googleapis.com/borzoi-paper/data/) (GCP).
+
+The Makefile relies on the script 'basenji_data.py' from the [basenji repository](https://github.com/calico/basenji/blob/master/bin/basenji_data.py), which in turn calls the scripts 'basenji_data_read.py' and 'basenji_data_write.py' from the same repo, in order to (1) read coverage data (from bigwig-like files) along with a matched segment from a fasta genome file, and (2) write the (one-hot coded) sequence along with coverage values into compressed TF records.
+
+*Notes*:
+- The attached Makefile shows the exact commands used to call basenji_data.py and other related scripts to create the specific training data for the published model.
+- The script(s) take as input a fasta genome file, optional blacklist+unmappable region files, as well as a .txt file where each row points to a bigwig coverage file location (see for [this file](https://raw.githubusercontent.com/calico/borzoi/main/examples/targets_human.txt)).
+
+The model training is done through the script 'hound_train.py' from the [baskerville repository](https://github.com/calico/baskerville/blob/main/src/baskerville/scripts/hound_train.py). Most of the training parameters are set through a .json file that is supplied to the script. The published model's parameter file can be found [here](https://storage.googleapis.com/seqnn-share/borzoi/params.json).