Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/macs3/python style cython (1st) #664

Merged
merged 11 commits into from
Oct 8, 2024
30 changes: 27 additions & 3 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
2024-10-08 Tao Liu <[email protected]>
MACS 3.0.3b

* Features added

1) We extensively rewrote the `pyx` codes into `py` codes. In
other words, we now apply the 'pure python style' with PEP-484
type annotations to our previous Cython style codes. As a result,
the source codes are more compatible with Python programming tools
such as `flake8`. During rewriting, we cleaned the source codes
even more, and removed unnecessary dependencies during
compilation.

* Bug fixed

1) Fix issues on big-endian systems in `Parser.py` codes. Enable
big-endian support in `BAM.py` codes for accessing certain
alignment records that overlap with given genomic
coordinates using BAM/BAI files.

* Doc

1) Explanation on the filtering criteria on SAM/BAM/BAMPE files.

2024-09-06 Tao Liu <[email protected]>
MACS 3.0.2

Expand Down Expand Up @@ -31,9 +55,9 @@

6) For gappedPeak output, set thickStart and thickEnd columns as
0, according to UCSC definition.

* Bugs fixed

1) Use `-O3` instead of `-Ofast` for compatibility. #637

* Documentation
Expand All @@ -46,7 +70,7 @@
3) Description on various file formats used in MACS3.

2024-02-19 Tao Liu <[email protected]>
MACS 3.0.1
MACS 3.0.1

* Bugs fixed

Expand Down
114 changes: 72 additions & 42 deletions MACS3/Commands/hmmratac_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2024-10-02 17:41:10 Tao Liu>
# Time-stamp: <2024-10-08 10:53:48 Tao Liu>


"""Description: Main HMMR command
Expand Down Expand Up @@ -58,25 +58,39 @@ def run(args):
#############################################
# 0. Names of output files
#############################################
short_bdgfile = os.path.join(options.outdir, options.name+"_digested_short.bdg")
mono_bdgfile = os.path.join(options.outdir, options.name+"_digested_mono.bdg")
di_bdgfile = os.path.join(options.outdir, options.name+"_digested_di.bdg")
tri_bdgfile = os.path.join(options.outdir, options.name+"_digested_tri.bdg")
training_region_bedfile = os.path.join(options.outdir, options.name+"_training_regions.bed")
training_datafile = os.path.join(options.outdir, options.name+"_training_data.txt")
training_datalengthfile = os.path.join(options.outdir, options.name+"_training_lengths.txt")

hmm_modelfile = os.path.join(options.outdir, options.name+"_model.json")

open_state_bdgfile = os.path.join(options.outdir, options.name+"_open.bdg")
nuc_state_bdgfile = os.path.join(options.outdir, options.name+"_nuc.bdg")
bg_state_bdgfile = os.path.join(options.outdir, options.name+"_bg.bdg")

states_file = os.path.join(options.outdir, options.name+"_states.bed")

accessible_file = os.path.join(options.outdir, options.name+"_accessible_regions.narrowPeak")

cutoffanalysis_file = os.path.join(options.outdir, options.name+"_cutoff_analysis.tsv")
short_bdgfile = os.path.join(options.outdir,
options.name+"_digested_short.bdg")
mono_bdgfile = os.path.join(options.outdir,
options.name+"_digested_mono.bdg")
di_bdgfile = os.path.join(options.outdir,
options.name+"_digested_di.bdg")
tri_bdgfile = os.path.join(options.outdir,
options.name+"_digested_tri.bdg")
training_region_bedfile = os.path.join(options.outdir,
options.name+"_training_regions.bed")
training_datafile = os.path.join(options.outdir,
options.name+"_training_data.txt")
training_datalengthfile = os.path.join(options.outdir,
options.name+"_training_lengths.txt")

hmm_modelfile = os.path.join(options.outdir,
options.name+"_model.json")

open_state_bdgfile = os.path.join(options.outdir,
options.name+"_open.bdg")
nuc_state_bdgfile = os.path.join(options.outdir,
options.name+"_nuc.bdg")
bg_state_bdgfile = os.path.join(options.outdir,
options.name+"_bg.bdg")

states_file = os.path.join(options.outdir,
options.name+"_states.bed")

accessible_file = os.path.join(options.outdir,
options.name+"_accessible_regions.narrowPeak")

cutoffanalysis_file = os.path.join(options.outdir,
options.name+"_cutoff_analysis.tsv")

#############################################
# 1. Read the input files
Expand Down Expand Up @@ -115,7 +129,10 @@ def run(args):
for l_p in peakio:
fs = l_p.rstrip().split()
i += 1
blacklist.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i)
blacklist.add(fs[0].encode(),
int(fs[1]),
int(fs[2]),
name=b"%d" % i)
blacklist.sort()
blacklist_regions = Regions()
blacklist_regions.init_from_PeakIO(blacklist)
Expand All @@ -131,8 +148,9 @@ def run(args):
else:
# we will use EM to get the best means/stddevs for the mono-, di- and tri- modes of fragment sizes
options.info("#2 Use EM algorithm to estimate means and stddevs of fragment lengths")
options.info("# for mono-, di-, and tri-nucleosomal signals...")
em_trainer = HMMR_EM(petrack, options.em_means[1:4], options.em_stddevs[1:4], seed=options.hmm_randomSeed)
options.info("# for mono-, di-, and tri-nucleosomal signals...")
em_trainer = HMMR_EM(petrack, options.em_means[1:4],
options.em_stddevs[1:4], seed=options.hmm_randomSeed)
# the mean and stddev after EM training
em_means = [options.em_means[0],]
em_means.extend(em_trainer.fragMeans)
Expand All @@ -143,29 +161,30 @@ def run(args):
em_means[i] = round(em_means[i], 1)
for i in range(len(em_stddevs)):
em_stddevs[i] = round(em_stddevs[i], 1)
options.info(f"# The means and stddevs after EM:")
options.info("# The means and stddevs after EM:")

options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
options.info( "# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
options.info( "# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))
options.info("# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
options.info("# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
options.info("# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))

# to finalize the EM training, we will decompose ATAC-seq into four signal tracks
options.info(f"# Compute the weights for each fragment length for each of the four signal types")
options.info("# Compute the weights for each fragment length for each of the four signal types")
fl_dict = petrack.count_fraglengths()
fl_list = list(fl_dict.keys())
fl_list.sort()

# now we will prepare the weights for each fragment length for
# each of the four distributions based on the EM results
weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs, min_frag_p=options.min_frag_p)
weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs,
min_frag_p=options.min_frag_p)

options.info("# Generate short, mono-, di-, and tri-nucleosomal signals")
digested_atac_signals = generate_digested_signals(petrack, weight_mapping)

# save three types of signals if needed
if options.save_digested:
bdgshort = bedGraphIO(short_bdgfile, data=digested_atac_signals[0])
bdgshort.write_bedGraph("short","short")
bdgshort.write_bedGraph("short", "short")

bdgmono = bedGraphIO(mono_bdgfile, data=digested_atac_signals[1])
bdgmono.write_bedGraph("mono", "mono")
Expand All @@ -177,8 +196,9 @@ def run(args):
bdgtri.write_bedGraph("tri", "tri")

minlen = int(petrack.average_template_length)
# if options.pileup_short is on, we pile up only the short fragments to identify training
# regions and to prescan for candidate regions for decoding.
# if options.pileup_short is on, we pile up only the short
# fragments to identify training regions and to prescan for
# candidate regions for decoding.
if options.pileup_short:
options.info("# Pile up ONLY short fragments")
fc_bdg = digested_atac_signals[0]
Expand All @@ -195,9 +215,13 @@ def run(args):
options.info(f"#3 Generate cutoff analysis report from {petrack.total} fragments")
options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file}")

# Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
# Let MACS3 do the cutoff analysis to help decide the lower
# and upper cutoffs
with open(cutoffanalysis_file, "w") as ofhd_cutoff:
ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, max_score=options.cutoff_analysis_max, steps=options.cutoff_analysis_steps))
ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen,
max_gap=options.hmm_training_flanking,
max_score=options.cutoff_analysis_max,
steps=options.cutoff_analysis_steps))
# raise Exception("Cutoff analysis only.")
sys.exit(1)

Expand All @@ -216,7 +240,7 @@ def run(args):
peaks = PeakIO()
for l_p in peakio:
fs = l_p.rstrip().split()
peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) #change based on what expected input file should contain
peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) # change based on what expected input file should contain
peakio.close()
training_regions = Regions()
training_regions.init_from_PeakIO(peaks)
Expand All @@ -228,7 +252,7 @@ def run(args):
options.info(f"#3 Look for training set from {petrack.total} fragments")
options.info(f"# Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}.")
options.info(f"# The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}")
options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
peaks = fc_bdg.call_peaks(cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
options.info(f"# Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}")
peaks.filter_score(options.hmm_lower, options.hmm_upper)
Expand Down Expand Up @@ -301,7 +325,10 @@ def run(args):

options.info("# Use Baum-Welch algorithm to train the HMM")

hmm_model = hmm_training(training_data, training_data_lengths, random_seed=options.hmm_randomSeed, hmm_type=options.hmm_type)
hmm_model = hmm_training(training_data,
training_data_lengths,
random_seed=options.hmm_randomSeed,
hmm_type=options.hmm_type)

options.info(f"# HMM converged: {hmm_model.monitor_.converged}")

Expand Down Expand Up @@ -334,7 +361,7 @@ def run(args):
assignments[i_open_region] = "open"
assignments[i_nucleosomal_region] = "nuc"
assignments[i_background_region] = "bg"

options.info(f"# The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:")
options.info(f"# open state index: state{i_open_region}")
options.info(f"# nucleosomal state index: state{i_nucleosomal_region}")
Expand All @@ -360,7 +387,7 @@ def run(args):
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]))
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]))
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]))


#############################################
# 5. Predict
Expand Down Expand Up @@ -450,7 +477,7 @@ def run(args):
# # Generate states path:
states_path = generate_states_path(predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region)
options.info("# finished generating states path")
predicted_proba_file.close() #kill the temp file
predicted_proba_file.close() # kill the temp file
# Save states path if needed
# PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
if options.save_states:
Expand Down Expand Up @@ -581,8 +608,11 @@ def add_regions(i, regions):
# This by default is the only final output from HMMRATAC
accessible_regions = []
for i in range(len(states_path)-2):
if (states_path[i][3] == 'nuc' and states_path[i+1][3] == 'open' and states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and states_path[i+1][2] == states_path[i+2][1] and
if (states_path[i][3] == 'nuc' and
states_path[i+1][3] == 'open' and
states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and
states_path[i+1][2] == states_path[i+2][1] and
states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen
accessible_regions = add_regions(i, accessible_regions)

Expand Down
Loading