#!/usr/bin/env python3
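"""HAPPI_GWAS pipeline driver.

Runs GAPIT through Snakemake, combines and filters the GWAS results, derives
LD regions around significant markers, extracts haplotype blocks with VCFtools
and Haploview, and annotates markers with overlapping GFF features.

Example invocation (illustrative paths and project name only):

    python3 HAPPI_GWAS.py \
        --project_name MyProject \
        --workflow_path /path/to/HAPPI_GWAS \
        --input_folder /path/to/input \
        --output_folder /path/to/output \
        --vcf_file /path/to/variants.vcf.gz \
        --gff_file /path/to/annotation.gff \
        --genotype_hapmap /path/to/genotype.hmp.txt \
        --model MLM
"""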
import sys
import re
import json
import gzip
import pathlib
import shlex
import subprocess
import argparse
import pandas as pd
def main(args):
#######################################################################
# Get arguments
#######################################################################
project_name = args.project_name
workflow_path = args.workflow_path
input_folder = args.input_folder
output_folder = args.output_folder
vcf_file = args.vcf_file
gff_file = args.gff_file
gff_category = args.gff_category
gff_key = args.gff_key
genotype_hapmap = args.genotype_hapmap
genotype_data = args.genotype_data
genotype_map = args.genotype_map
kinship = args.kinship
z_matrix = args.z_matrix
corvariance_matrix = args.corvariance_matrix
snp_maf = args.snp_maf
model = args.model
pca_total = args.pca_total
ulimit = args.ulimit
memory = args.memory
threads = args.threads
keep_going = args.keep_going
jobs = args.jobs
latency_wait = args.latency_wait
cluster = args.cluster
p_value_filter = args.p_value_filter
fdr_corrected_p_value_filter = args.fdr_corrected_p_value_filter
ld_length = args.ld_length
#######################################################################
# Check mandatory and at-least-one arguments
#######################################################################
    # Project name checking
    if project_name is None or project_name == '':
        print('Project name is missing!!!')
        sys.exit(1)
    # Workflow path checking
    if workflow_path is None:
        print('Workflow path is missing!!!')
        sys.exit(1)
    if not workflow_path.exists():
        print('Workflow path does not exist!!!')
        sys.exit(1)
    # Input folder checking
    if input_folder is None:
        print('Input folder is missing!!!')
        sys.exit(1)
    if not input_folder.exists():
        print('Input folder does not exist!!!')
        sys.exit(1)
    # VCF file checking
    if vcf_file is None:
        print('VCF file is missing!!!')
        sys.exit(1)
    if not vcf_file.exists():
        print('VCF file does not exist!!!')
        sys.exit(1)
    # GFF file checking
    if gff_file is None:
        print('GFF file is missing!!!')
        sys.exit(1)
    if not gff_file.exists():
        print('GFF file does not exist!!!')
        sys.exit(1)
# At-least-one arguments checking
if genotype_hapmap != 'NULL':
try:
genotype_hapmap = pathlib.Path(genotype_hapmap)
if not genotype_hapmap.exists():
genotype_hapmap = 'NULL'
except Exception as e:
genotype_hapmap = 'NULL'
print(e)
if genotype_data != 'NULL':
try:
genotype_data = pathlib.Path(genotype_data)
if not genotype_data.exists():
genotype_data = 'NULL'
except Exception as e:
genotype_data = 'NULL'
print(e)
if genotype_map != 'NULL':
try:
genotype_map = pathlib.Path(genotype_map)
if not genotype_map.exists():
genotype_map = 'NULL'
except Exception as e:
genotype_map = 'NULL'
print(e)
if genotype_hapmap == 'NULL' and genotype_data == 'NULL' and genotype_map == 'NULL':
print('Genotype hapmap, genotype data, and genotype map are not provided!!!')
sys.exit(1)
#######################################################################
# Check if output parent folder exists
# If not, create the output parent folder
#######################################################################
    if not output_folder.exists():
        try:
            output_folder.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            print(e)
    if not output_folder.exists():
        print("Output folder does not exist!!!")
        sys.exit(1)
# Create GAPIT output folder
gapit_output_folder = output_folder.joinpath('GAPIT', model)
    if not gapit_output_folder.exists():
        try:
            gapit_output_folder.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            print(e)
    if not gapit_output_folder.exists():
        print("GAPIT output folder does not exist!!!")
        sys.exit(1)
# Create VCFtools_Haploview output folder
vcftools_haploview_output_folder = output_folder.joinpath('VCFtools_Haploview')
    if not vcftools_haploview_output_folder.exists():
        try:
            vcftools_haploview_output_folder.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            print(e)
    if not vcftools_haploview_output_folder.exists():
        print("VCFtools_Haploview output folder does not exist!!!")
        sys.exit(1)
#######################################################################
# Generate and save GAPIT configuration file
#######################################################################
gapit_config_data = {
"project_name": project_name,
"workflow_path": str(workflow_path),
"input_folder": str(input_folder),
"output_folder": str(gapit_output_folder),
"genotype_hapmap": str(genotype_hapmap),
"genotype_data": str(genotype_data),
"genotype_map": str(genotype_map),
"kinship": kinship,
"z_matrix": z_matrix,
"corvariance_matrix": corvariance_matrix,
"snp_maf": snp_maf,
"model": model,
"pca_total": pca_total,
"ulimit": ulimit,
"memory": memory,
"threads": threads
}
with open(gapit_output_folder.joinpath("Snakemake_GAPIT_config.json"), 'w', encoding='utf-8') as writer:
json.dump(gapit_config_data, writer, ensure_ascii=False, indent=4)
#######################################################################
# Run GAPIT in Snakemake
#######################################################################
# Construct snakemake command array
snakemake_gapit_command_array = [
"snakemake",
'--jobs', str(jobs),
'--latency-wait', str(latency_wait),
'--printshellcmds',
'--snakefile', str(workflow_path.joinpath('rules', 'r_gapit.smk')),
'--configfile', str(gapit_output_folder.joinpath('Snakemake_GAPIT_config.json'))
]
    if keep_going:
        snakemake_gapit_command_array.append('--keep-going')
    if cluster:
        snakemake_gapit_command_array.extend(['--executor', 'cluster-generic', '--cluster-generic-submit-cmd'])
        snakemake_gapit_command_array.append('"' + str(cluster) + '"')
    # Construct snakemake command string from snakemake command array
    snakemake_gapit_command_string = ' '.join(snakemake_gapit_command_array)
    print("\n\n{}\n".format(snakemake_gapit_command_string))
    # Split snakemake command string using shlex to prepare for subprocess
    # (shlex keeps the quoted cluster submit command together as one argument)
    snakemake_gapit_command = shlex.split(snakemake_gapit_command_string)
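    # For illustration, with the default settings the assembled command resembles:
    #   snakemake --jobs 2 --latency-wait 180 --printshellcmds \
    #     --snakefile <workflow_path>/rules/r_gapit.smk \
    #     --configfile <gapit_output_folder>/Snakemake_GAPIT_config.json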
try:
# Run snakemake command using subprocess
snakemake_gapit_outcome = subprocess.run(
snakemake_gapit_command,
capture_output=True,
check=True,
universal_newlines=True
)
with open(gapit_output_folder.joinpath("Snakemake_GAPIT.log"), 'w') as writer:
writer.write(
"\n\nStandard output: \n{} \n\nStandard error: \n{} \n\nCommand: \n{} \n\nReturn code: \n{} \n\n".format(
snakemake_gapit_outcome.stdout, snakemake_gapit_outcome.stderr, snakemake_gapit_outcome.args,
snakemake_gapit_outcome.returncode))
except subprocess.CalledProcessError as e:
print(
"\n\nOutput: \n{} \n\nStandard output: \n{} \n\nStandard error: \n{} \n\nCommand: \n{} \n\nReturn code: \n{} \n\n".format(
e.output, e.stdout, e.stderr, e.cmd, e.returncode))
except Exception as e:
print(e)
#######################################################################
# Combine GAPIT results
#######################################################################
# Specify GAPIT results file and GAPIT result LD region file
gapit_gwas_result_file = gapit_output_folder.joinpath("GAPIT.Association.GWAS_Results." + str(model) + ".All.txt")
gapit_gwas_result_ld_region_file = gapit_output_folder.joinpath(
"GAPIT.Association.GWAS_Results." + str(model) + ".Unique_LD_Regions.txt")
# Data save parameters
header = True
mode = 'w'
# Find GAPIT results files
if gapit_output_folder.exists():
gapit_auto_output_folder = gapit_output_folder.joinpath('GAPIT_auto_output')
if gapit_auto_output_folder.exists():
gwas_result_files = list(gapit_auto_output_folder.glob('GAPIT.Association.GWAS_Results*.csv'))
if len(gwas_result_files) > 0:
for i in range(len(gwas_result_files)):
# Read GWAS result file
dat = pd.read_csv(filepath_or_buffer=gwas_result_files[i])
# Extract the trait from filename
trait = re.sub("(GAPIT.Association.GWAS_Results.)|(.csv)", "", str(gwas_result_files[i].name))
trait = trait.replace(model, '')
trait = trait.lstrip('\\.')
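                    # For example (illustrative trait name), with model 'MLM' the file
                    # 'GAPIT.Association.GWAS_Results.MLM.PlantHeight.csv' yields the
                    # trait 'PlantHeight'.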
                    # Add Model, Trait, LD_length, LD_start, and LD_end columns
dat['Model'] = model
dat['Trait'] = trait
dat["LD_length"] = ld_length
dat["LD_start"] = dat["Pos"] - ld_length
dat["LD_end"] = dat["Pos"] + ld_length
dat.loc[dat.LD_start < 0, 'LD_start'] = 0
# Filter data based on p-value and FDR corrected p-value
dat = dat[dat["P.value"] <= p_value_filter]
dat = dat[dat["H&B.P.Value"] <= fdr_corrected_p_value_filter]
# Check if data is not empty
if dat.shape[0] > 0 and dat.shape[1] > 0:
# Write to one output file to append all data
dat.to_csv(path_or_buf=gapit_gwas_result_file, sep='\t', mode=mode, header=header, index=False)
                        # Switch to append mode without a header after the first write
                        header = False
                        mode = 'a'
else:
print("No GAPIT GWAS result was created in the analysis!!!")
sys.exit(1)
else:
print("No GAPIT auto output folder was created in the analysis!!!")
sys.exit(1)
else:
print("Output folder does not exists!!!")
sys.exit(1)
#######################################################################
# Query LD regions and drop duplicates
#######################################################################
# Check if GAPIT GWAS result file exists
if gapit_gwas_result_file.exists():
# Read GAPIT GWAS result file
dat = pd.read_table(filepath_or_buffer=gapit_gwas_result_file)
# Check if data is not empty
if dat.shape[0] > 0 and dat.shape[1] > 0:
# Query LD regions
dat = dat[["Chr", "LD_start", "LD_end"]]
# Sort by Chr, LD_start, and LD_end
dat = dat.sort_values(by=['Chr', 'LD_start', 'LD_end'])
# Drop duplicates
dat = dat.drop_duplicates()
# Check if data is not empty
if dat.shape[0] > 0 and dat.shape[1] > 0:
# Save unique LD regions (header is not included in the output file so that snakemake part is easier)
dat.to_csv(path_or_buf=gapit_gwas_result_ld_region_file, sep='\t', mode='w', header=False, index=False)
else:
print("No GAPIT GWAS result was created in the analysis!!!")
sys.exit(1)
else:
print("GAPIT GWAS result file does not exists!!!")
sys.exit(1)
if not gapit_gwas_result_ld_region_file.exists():
print("GAPIT GWAS result LD region file cannot be created as no LD region found!!!")
sys.exit(1)
#######################################################################
# Generate and save VCFtools_Haploview configuration file
#######################################################################
vcftools_haploview_config_data = {
"project_name": project_name,
"workflow_path": str(workflow_path),
"input_file": str(gapit_gwas_result_ld_region_file),
"output_folder": str(vcftools_haploview_output_folder),
"vcf_file": str(vcf_file),
"ulimit": ulimit,
"memory": memory,
"threads": threads
}
with open(vcftools_haploview_output_folder.joinpath("Snakemake_VCFtools_Haploview_config.json"), 'w',
encoding='utf-8') as writer:
json.dump(vcftools_haploview_config_data, writer, ensure_ascii=False, indent=4)
#######################################################################
# Run VCFtools and Haploview in Snakemake
#######################################################################
# Construct snakemake command array
snakemake_vcftools_haploview_command_array = [
"snakemake",
'--jobs', str(jobs),
'--latency-wait', str(latency_wait),
'--printshellcmds',
'--snakefile', str(workflow_path.joinpath('rules', 'vcftools_haploview.smk')),
'--configfile', str(vcftools_haploview_output_folder.joinpath('Snakemake_VCFtools_Haploview_config.json'))
]
    if keep_going:
        snakemake_vcftools_haploview_command_array.append('--keep-going')
    if cluster:
        snakemake_vcftools_haploview_command_array.extend(['--executor', 'cluster-generic', '--cluster-generic-submit-cmd'])
        snakemake_vcftools_haploview_command_array.append('"' + str(cluster) + '"')
    # Construct snakemake command string from snakemake command array
    snakemake_vcftools_haploview_command_string = ' '.join(snakemake_vcftools_haploview_command_array)
    print("\n\n{}\n".format(snakemake_vcftools_haploview_command_string))
    # Split snakemake command string using shlex to prepare for subprocess
    snakemake_vcftools_haploview_command = shlex.split(snakemake_vcftools_haploview_command_string)
try:
# Run snakemake command using subprocess
snakemake_vcftools_haploview_outcome = subprocess.run(
snakemake_vcftools_haploview_command,
capture_output=True,
check=True,
universal_newlines=True
)
with open(vcftools_haploview_output_folder.joinpath("Snakemake_VCFtools_Haploview.log"), 'w') as writer:
writer.write(
"\n\nStandard output: \n{} \n\nStandard error: \n{} \n\nCommand: \n{} \n\nReturn code: \n{} \n\n".format(
snakemake_vcftools_haploview_outcome.stdout, snakemake_vcftools_haploview_outcome.stderr,
snakemake_vcftools_haploview_outcome.args, snakemake_vcftools_haploview_outcome.returncode))
except subprocess.CalledProcessError as e:
print(
"\n\nOutput: \n{} \n\nStandard output: \n{} \n\nStandard error: \n{} \n\nCommand: \n{} \n\nReturn code: \n{} \n\n".format(
e.output, e.stdout, e.stderr, e.cmd, e.returncode))
except Exception as e:
print(e)
#######################################################################
# Extract haploblock regions
#######################################################################
# Create haplotype blocks file and write header
haplotype_blocks_file = vcftools_haploview_output_folder.joinpath("Haplotype_blocks.txt")
with open(haplotype_blocks_file, 'w') as writer:
writer.write("Chromosome\tStart\tEnd\tHaploblock_start\tHaploblock_end\n")
# Find Haploview and VCFtools output folders
haploview_output_folder = vcftools_haploview_output_folder.joinpath("Haploview")
vcftools_output_folder = vcftools_haploview_output_folder.joinpath("VCFtools")
# Find all GABRIELblocks files
gabrielblocks_files = list(haploview_output_folder.glob("*.GABRIELblocks"))
# Loop through all GABRIELblocks files, extract chromosome, ld_start, and ld_end, and find corresponding recode VCF file
# Extract haploblock start and end based on indexes in GABRIELblocks file and positions in recode VCF file
if len(gabrielblocks_files) > 0:
for i in range(len(gabrielblocks_files)):
gabrielblocks_file = gabrielblocks_files[i]
            # Extract file prefix from GABRIELblocks file name
            file_prefix = re.sub(r'\.GABRIELblocks$', '', str(gabrielblocks_file.name))
            # Extract chromosome, start, and end from GABRIELblocks file prefix
            chrom_start_end = re.split('__', file_prefix)
            chromosome = chrom_start_end[0]
            ld_start = chrom_start_end[1]
            ld_end = chrom_start_end[2]
            # Create recode VCF file path
            recode_vcf_file = vcftools_output_folder.joinpath(file_prefix + ".recode.vcf.gz")
if recode_vcf_file.exists():
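                # A recode VCF data line is tab-separated with CHROM and POS in the
                # first two columns, e.g. (hypothetical): 'Chr1    15342   .   A   G   ...'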
                # Create recode VCF chromosome and position arrays
                recode_vcf_chromosome_array = []
                recode_vcf_position_array = []
                # Read recode VCF file (plain or gzipped) and collect chromosome and position data into arrays
                opener = gzip.open if str(recode_vcf_file).endswith('gz') else open
                with opener(recode_vcf_file, 'rt') as reader:
                    # Skip meta lines until the #CHROM header line (stop at EOF if it is missing)
                    header = reader.readline()
                    while header and not header.strip().startswith("#CHROM"):
                        header = reader.readline()
                    for line in reader:
                        line_array = str(line).strip("\r\n").split("\t")
                        recode_vcf_chromosome_array.append(line_array[0])
                        recode_vcf_position_array.append(line_array[1])
# Create haploblock start and end arrays
haploblock_start_array = []
haploblock_end_array = []
# Read GABRIELblocks file and collect haploblock start and end data into arrays
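                # Haploview GABRIELblocks lines typically look like (illustrative):
                #   BLOCK 1.  MARKERS: 3 4 5 7
                # where the numbers are 1-based marker indexes into the recode VCF file.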
                with open(gabrielblocks_file, "r") as reader:
                    for line in reader:
                        if re.search("MARKERS", line) is not None:
                            line = str(re.sub(".*MARKERS:", "", line)).strip("\r\n").strip(" ")
                            line_array = re.split(" ", line)
                            if len(line_array) > 1:
                                # Convert the 1-based marker indexes to 0-based array indexes
                                haploblock_start_index = int(float(line_array[0])) - 1
                                haploblock_end_index = int(float(line_array[-1])) - 1
                                # Guard against indexes that run past the end of the recode VCF
                                if (-1 < haploblock_start_index < haploblock_end_index) and (haploblock_end_index < len(recode_vcf_position_array)):
                                    haploblock_start = recode_vcf_position_array[haploblock_start_index]
                                    haploblock_end = recode_vcf_position_array[haploblock_end_index]
                                    haploblock_start_array.append(haploblock_start)
                                    haploblock_end_array.append(haploblock_end)
# If haploblock start and end arrays are not empty, write haploblock data to file
if (len(haploblock_start_array) > 0) and (len(haploblock_end_array) > 0) and (
len(haploblock_start_array) == len(haploblock_end_array)):
                    with open(haplotype_blocks_file, 'a') as writer:
                        for j in range(len(haploblock_start_array)):
                            writer.write(
                                "{}\t{}\t{}\t{}\t{}\n".format(chromosome, ld_start, ld_end, haploblock_start_array[j],
                                                              haploblock_end_array[j]))
#######################################################################
# Generate GWAS results file with haploblock data
#######################################################################
# Read data in haplotype blocks file into array
haplotype_blocks_data_array = []
    with open(haplotype_blocks_file, "r") as reader:
        # Skip the header line
        reader.readline()
        for line in reader:
            line_array = str(line).strip("\r\n").split("\t")
            haplotype_blocks_data_array.append(line_array)
# Create GWAS results file with haploblock data
vcftools_haploview_gwas_result_file = vcftools_haploview_output_folder.joinpath(
"GAPIT.Association.GWAS_Results." + str(model) + ".All.txt")
    # Read GAPIT GWAS results file and add haploblock data
    with open(vcftools_haploview_gwas_result_file, "w") as f_hdl, open(gapit_gwas_result_file, "r") as reader:
        header = reader.readline()
        header = str(header).strip("\r\n") + "\tHaploblock_start\tHaploblock_end\n"
        f_hdl.write(header)
        for line in reader:
            line = str(line).strip("\r\n")
            line_array = line.split("\t")
            haploblock_start = ""
            haploblock_end = ""
            if len(haplotype_blocks_data_array) > 0:
                for i in range(len(haplotype_blocks_data_array)):
                    # Match chromosome
                    if line_array[1] == haplotype_blocks_data_array[i][0]:
                        # Check if marker position is within the haplotype block region
                        if int(float(haplotype_blocks_data_array[i][3])) <= int(float(line_array[2])) <= int(float(haplotype_blocks_data_array[i][4])):
                            haploblock_start = haplotype_blocks_data_array[i][3]
                            haploblock_end = haplotype_blocks_data_array[i][4]
                            break
            line = line + "\t" + str(haploblock_start) + "\t" + str(haploblock_end) + "\n"
            f_hdl.write(line)
#######################################################################
# Extract GFF regions that enclose haplotype blocks or markers
#######################################################################
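    # Overlap strategies assigned below (worked example with an illustrative
    # gene spanning positions 1000-2000):
    #   "Full"    - the haploblock lies entirely inside the gene (e.g. block 1200-1800)
    #   "Partial" - the haploblock crosses the gene start or end (e.g. block 800-1500)
    #   "Marker"  - no haploblock overlap, but the marker position itself falls
    #               inside the gene (e.g. marker at position 1500, no haploblock)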
# Load GFF file into data array
gff_data_array = []
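    # The attributes column (9th GFF column) is reduced below to the bare value of
    # gff_key. For example (hypothetical attribute string), with gff_key = 'ID':
    #   'ID=gene00001;Name=EDEN'  ->  'gene00001'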
    with open(gff_file, "r") as reader:
        for line in reader:
            line = str(line).strip("\r\n")
            if not line.startswith("#"):
                line_array = line.split("\t")
                if line_array[2] == gff_category:
                    line_array[8] = str(re.sub(';.*', '', re.sub(fr'^.*?{gff_key}', '', line_array[8]))).strip(
                        "=").strip(":").strip(";").strip("_").strip("-")
                    gff_data_array.append(line_array)
# Create GWAS results file with haploblock data and GFF regions
output_gwas_result_file = output_folder.joinpath("GAPIT.Association.GWAS_Results." + str(model) + ".All.txt")
    # Read GWAS results file and add GFF regions
    with open(output_gwas_result_file, "w") as f_hdl, open(vcftools_haploview_gwas_result_file, "r") as reader:
        header = reader.readline()
        header = str(header).strip("\r\n") + "\tGene_start\tGene_end\tGene_ID\tOverlap_strategy\n"
        f_hdl.write(header)
        for line in reader:
            line = str(line).strip("\r\n")
            line_array = line.split("\t")
            gene_start = ""
            gene_end = ""
            gene_id = ""
            overlap_strategy = ""
            if len(gff_data_array) > 0:
                for i in range(len(gff_data_array)):
                    # Match chromosome
                    if line_array[1] == gff_data_array[i][0]:
                        gff_start = int(float(gff_data_array[i][3]))
                        gff_end = int(float(gff_data_array[i][4]))
                        # Columns 13 and 14 are the Haploblock_start and Haploblock_end
                        # appended above; check if a haplotype block region exists
                        if (line_array[13] != '') and (line_array[14] != ''):
                            hap_start = int(float(line_array[13]))
                            hap_end = int(float(line_array[14]))
                            # Haploblock fully enclosed by the GFF region
                            if (gff_start <= hap_start <= gff_end) and (gff_start <= hap_end <= gff_end):
                                gene_start = gff_data_array[i][3]
                                gene_end = gff_data_array[i][4]
                                gene_id = gff_data_array[i][8]
                                overlap_strategy = "Full"
                            # Haploblock overlaps the start of the GFF region
                            if (overlap_strategy != "Full") and (hap_start < gff_start) and (gff_start <= hap_end <= gff_end):
                                gene_start = gff_data_array[i][3]
                                gene_end = gff_data_array[i][4]
                                gene_id = gff_data_array[i][8]
                                overlap_strategy = "Partial"
                            # Haploblock overlaps the end of the GFF region
                            if (overlap_strategy != "Full") and (gff_start <= hap_start <= gff_end) and (hap_end > gff_end):
                                gene_start = gff_data_array[i][3]
                                gene_end = gff_data_array[i][4]
                                gene_id = gff_data_array[i][8]
                                overlap_strategy = "Partial"
                        # Check if the marker position itself is within the GFF region
                        if (overlap_strategy == "") and (gff_start <= int(float(line_array[2])) <= gff_end):
                            gene_start = gff_data_array[i][3]
                            gene_end = gff_data_array[i][4]
                            gene_id = gff_data_array[i][8]
                            overlap_strategy = "Marker"
            line = line + "\t" + str(gene_start) + "\t" + str(gene_end) + "\t" + str(gene_id) + "\t" + str(overlap_strategy) + "\n"
            f_hdl.write(line)
if __name__ == "__main__":
#######################################################################
# Parse arguments
#######################################################################
    parser = argparse.ArgumentParser(
        prog='HAPPI_GWAS',
        description='HAPPI_GWAS: a Snakemake-based GWAS pipeline that runs GAPIT and annotates '
                    'significant markers with Haploview haploblocks and GFF gene regions')
parser.add_argument('-p', '--project_name', help='Project name', type=str, required=True)
parser.add_argument('-w', '--workflow_path', help='Workflow path', type=pathlib.Path, required=True)
parser.add_argument('-i', '--input_folder', help='Input folder', type=pathlib.Path, required=True)
parser.add_argument('-o', '--output_folder', help='Output folder', type=pathlib.Path, required=True)
parser.add_argument('-v', '--vcf_file', help='VCF file', type=pathlib.Path, required=True)
parser.add_argument('-g', '--gff_file', help='GFF file', type=pathlib.Path, required=True)
parser.add_argument('--gff_category', help='GFF category', type=str, default='gene')
parser.add_argument('--gff_key', help='GFF key', type=str, default='ID')
parser.add_argument('--genotype_hapmap', default='NULL', help='Genotype hapmap', type=str)
parser.add_argument('--genotype_data', default='NULL', help='Genotype data', type=str)
parser.add_argument('--genotype_map', default='NULL', help='Genotype map', type=str)
parser.add_argument('--kinship', default='NULL', type=str, help='Kinship matrix file')
parser.add_argument('--z_matrix', default='NULL', type=str, help='Z matrix file')
    parser.add_argument('--corvariance_matrix', default='NULL', type=str, help='Covariance matrix file')
parser.add_argument('--snp_maf', default=0.0, type=float, help='SNP minor allele frequency')
parser.add_argument('--model', default='MLM', type=str, help='Model')
parser.add_argument('--pca_total', default=0, type=int, help='Total PCA')
parser.add_argument('--ulimit', default=0, type=int, help='Ulimit')
parser.add_argument('--memory', default=20, type=int, help='Memory')
parser.add_argument('--threads', default=4, type=int, help='Threads')
parser.add_argument('--keep_going', action='store_true', help='Keep going')
parser.add_argument('--jobs', default=2, type=int, help='Jobs')
parser.add_argument('--latency_wait', default=180, type=int, help='Latency wait')
parser.add_argument('--cluster', default='', type=str, help='Cluster parameters')
parser.add_argument('--p_value_filter', default=1.0, type=float, help='P-value filter')
parser.add_argument('--fdr_corrected_p_value_filter', default=1.0, type=float, help='FDR corrected p-value filter')
parser.add_argument('--ld_length', default=10000, type=int, help='LD length')
args = parser.parse_args()
#######################################################################
# Call main function
#######################################################################
main(args)