Skip to content

Commit

Permalink
update changelog. prepare temp push
Browse files Browse the repository at this point in the history
  • Loading branch information
taoliu committed Oct 8, 2024
1 parent 954774f commit 263dcc6
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 45 deletions.
30 changes: 27 additions & 3 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,27 @@
2024-10-08 Tao Liu <[email protected]>
MACS 3.0.3b

* Features added

1) We extensively rewrote the `pyx` codes into `py` codes. In
other words, we now apply the 'pure Python style' with PEP-484
type annotations to our previous Cython-style codes, so that the
source codes are more compatible with Python programming tools
such as `flake8`. During rewriting, we cleaned up the source codes
even more, and removed unnecessary dependencies during
compilation.

* Bugs fixed

1) Fix issues on big-endian systems in `Parser.py` codes. Enable
big-endian support in `BAM.py` codes for accessing certain
alignment records that overlap with given genomic
coordinates using BAM/BAI files.

* Doc

1) Explanation on the filtering criteria on SAM/BAM/BAMPE files.

2024-09-06 Tao Liu <[email protected]>
MACS 3.0.2

Expand Down Expand Up @@ -31,9 +55,9 @@

6) For gappedPeak output, set thickStart and thickEnd columns as
0, according to UCSC definition.

* Bugs fixed

1) Use `-O3` instead of `-Ofast` for compatibility. #637

* Documentation
Expand All @@ -46,7 +70,7 @@
3) Description on various file formats used in MACS3.

2024-02-19 Tao Liu <[email protected]>
MACS 3.0.1
MACS 3.0.1

* Bugs fixed

Expand Down
114 changes: 72 additions & 42 deletions MACS3/Commands/hmmratac_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2024-10-02 17:41:10 Tao Liu>
# Time-stamp: <2024-10-08 10:53:48 Tao Liu>


"""Description: Main HMMR command
Expand Down Expand Up @@ -58,25 +58,39 @@ def run(args):
#############################################
# 0. Names of output files
#############################################
short_bdgfile = os.path.join(options.outdir, options.name+"_digested_short.bdg")
mono_bdgfile = os.path.join(options.outdir, options.name+"_digested_mono.bdg")
di_bdgfile = os.path.join(options.outdir, options.name+"_digested_di.bdg")
tri_bdgfile = os.path.join(options.outdir, options.name+"_digested_tri.bdg")
training_region_bedfile = os.path.join(options.outdir, options.name+"_training_regions.bed")
training_datafile = os.path.join(options.outdir, options.name+"_training_data.txt")
training_datalengthfile = os.path.join(options.outdir, options.name+"_training_lengths.txt")

hmm_modelfile = os.path.join(options.outdir, options.name+"_model.json")

open_state_bdgfile = os.path.join(options.outdir, options.name+"_open.bdg")
nuc_state_bdgfile = os.path.join(options.outdir, options.name+"_nuc.bdg")
bg_state_bdgfile = os.path.join(options.outdir, options.name+"_bg.bdg")

states_file = os.path.join(options.outdir, options.name+"_states.bed")

accessible_file = os.path.join(options.outdir, options.name+"_accessible_regions.narrowPeak")

cutoffanalysis_file = os.path.join(options.outdir, options.name+"_cutoff_analysis.tsv")
short_bdgfile = os.path.join(options.outdir,
options.name+"_digested_short.bdg")
mono_bdgfile = os.path.join(options.outdir,
options.name+"_digested_mono.bdg")
di_bdgfile = os.path.join(options.outdir,
options.name+"_digested_di.bdg")
tri_bdgfile = os.path.join(options.outdir,
options.name+"_digested_tri.bdg")
training_region_bedfile = os.path.join(options.outdir,
options.name+"_training_regions.bed")
training_datafile = os.path.join(options.outdir,
options.name+"_training_data.txt")
training_datalengthfile = os.path.join(options.outdir,
options.name+"_training_lengths.txt")

hmm_modelfile = os.path.join(options.outdir,
options.name+"_model.json")

open_state_bdgfile = os.path.join(options.outdir,
options.name+"_open.bdg")
nuc_state_bdgfile = os.path.join(options.outdir,
options.name+"_nuc.bdg")
bg_state_bdgfile = os.path.join(options.outdir,
options.name+"_bg.bdg")

states_file = os.path.join(options.outdir,
options.name+"_states.bed")

accessible_file = os.path.join(options.outdir,
options.name+"_accessible_regions.narrowPeak")

cutoffanalysis_file = os.path.join(options.outdir,
options.name+"_cutoff_analysis.tsv")

#############################################
# 1. Read the input files
Expand Down Expand Up @@ -115,7 +129,10 @@ def run(args):
for l_p in peakio:
fs = l_p.rstrip().split()
i += 1
blacklist.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i)
blacklist.add(fs[0].encode(),
int(fs[1]),
int(fs[2]),
name=b"%d" % i)
blacklist.sort()
blacklist_regions = Regions()
blacklist_regions.init_from_PeakIO(blacklist)
Expand All @@ -131,8 +148,9 @@ def run(args):
else:
# we will use EM to get the best means/stddevs for the mono-, di- and tri- modes of fragment sizes
options.info("#2 Use EM algorithm to estimate means and stddevs of fragment lengths")
options.info("# for mono-, di-, and tri-nucleosomal signals...")
em_trainer = HMMR_EM(petrack, options.em_means[1:4], options.em_stddevs[1:4], seed=options.hmm_randomSeed)
options.info("# for mono-, di-, and tri-nucleosomal signals...")
em_trainer = HMMR_EM(petrack, options.em_means[1:4],
options.em_stddevs[1:4], seed=options.hmm_randomSeed)
# the mean and stddev after EM training
em_means = [options.em_means[0],]
em_means.extend(em_trainer.fragMeans)
Expand All @@ -143,29 +161,30 @@ def run(args):
em_means[i] = round(em_means[i], 1)
for i in range(len(em_stddevs)):
em_stddevs[i] = round(em_stddevs[i], 1)
options.info(f"# The means and stddevs after EM:")
options.info("# The means and stddevs after EM:")

options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
options.info( "# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
options.info( "# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))
options.info("# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
options.info("# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
options.info("# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))

# to finalize the EM training, we will decompose ATAC-seq into four signal tracks
options.info(f"# Compute the weights for each fragment length for each of the four signal types")
options.info("# Compute the weights for each fragment length for each of the four signal types")
fl_dict = petrack.count_fraglengths()
fl_list = list(fl_dict.keys())
fl_list.sort()

# now we will prepare the weights for each fragment length for
# each of the four distributions based on the EM results
weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs, min_frag_p=options.min_frag_p)
weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs,
min_frag_p=options.min_frag_p)

options.info("# Generate short, mono-, di-, and tri-nucleosomal signals")
digested_atac_signals = generate_digested_signals(petrack, weight_mapping)

# save three types of signals if needed
if options.save_digested:
bdgshort = bedGraphIO(short_bdgfile, data=digested_atac_signals[0])
bdgshort.write_bedGraph("short","short")
bdgshort.write_bedGraph("short", "short")

bdgmono = bedGraphIO(mono_bdgfile, data=digested_atac_signals[1])
bdgmono.write_bedGraph("mono", "mono")
Expand All @@ -177,8 +196,9 @@ def run(args):
bdgtri.write_bedGraph("tri", "tri")

minlen = int(petrack.average_template_length)
# if options.pileup_short is on, we pile up only the short fragments to identify training
# regions and to prescan for candidate regions for decoding.
# if options.pileup_short is on, we pile up only the short
# fragments to identify training regions and to prescan for
# candidate regions for decoding.
if options.pileup_short:
options.info("# Pile up ONLY short fragments")
fc_bdg = digested_atac_signals[0]
Expand All @@ -195,9 +215,13 @@ def run(args):
options.info(f"#3 Generate cutoff analysis report from {petrack.total} fragments")
options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file}")

# Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
# Let MACS3 do the cutoff analysis to help decide the lower
# and upper cutoffs
with open(cutoffanalysis_file, "w") as ofhd_cutoff:
ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, max_score=options.cutoff_analysis_max, steps=options.cutoff_analysis_steps))
ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen,
max_gap=options.hmm_training_flanking,
max_score=options.cutoff_analysis_max,
steps=options.cutoff_analysis_steps))
# raise Exception("Cutoff analysis only.")
sys.exit(1)

Expand All @@ -216,7 +240,7 @@ def run(args):
peaks = PeakIO()
for l_p in peakio:
fs = l_p.rstrip().split()
peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) #change based on what expected input file should contain
peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) # change based on what expected input file should contain
peakio.close()
training_regions = Regions()
training_regions.init_from_PeakIO(peaks)
Expand All @@ -228,7 +252,7 @@ def run(args):
options.info(f"#3 Look for training set from {petrack.total} fragments")
options.info(f"# Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}.")
options.info(f"# The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}")
options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
peaks = fc_bdg.call_peaks(cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
options.info(f"# Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}")
peaks.filter_score(options.hmm_lower, options.hmm_upper)
Expand Down Expand Up @@ -301,7 +325,10 @@ def run(args):

options.info("# Use Baum-Welch algorithm to train the HMM")

hmm_model = hmm_training(training_data, training_data_lengths, random_seed=options.hmm_randomSeed, hmm_type=options.hmm_type)
hmm_model = hmm_training(training_data,
training_data_lengths,
random_seed=options.hmm_randomSeed,
hmm_type=options.hmm_type)

options.info(f"# HMM converged: {hmm_model.monitor_.converged}")

Expand Down Expand Up @@ -334,7 +361,7 @@ def run(args):
assignments[i_open_region] = "open"
assignments[i_nucleosomal_region] = "nuc"
assignments[i_background_region] = "bg"

options.info(f"# The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:")
options.info(f"# open state index: state{i_open_region}")
options.info(f"# nucleosomal state index: state{i_nucleosomal_region}")
Expand All @@ -360,7 +387,7 @@ def run(args):
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]))
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]))
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]))


#############################################
# 5. Predict
Expand Down Expand Up @@ -450,7 +477,7 @@ def run(args):
# # Generate states path:
states_path = generate_states_path(predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region)
options.info("# finished generating states path")
predicted_proba_file.close() #kill the temp file
predicted_proba_file.close() # kill the temp file
# Save states path if needed
# PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
if options.save_states:
Expand Down Expand Up @@ -581,8 +608,11 @@ def add_regions(i, regions):
# This by default is the only final output from HMMRATAC
accessible_regions = []
for i in range(len(states_path)-2):
if (states_path[i][3] == 'nuc' and states_path[i+1][3] == 'open' and states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and states_path[i+1][2] == states_path[i+2][1] and
if (states_path[i][3] == 'nuc' and
states_path[i+1][3] == 'open' and
states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and
states_path[i+1][2] == states_path[i+2][1] and
states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen
accessible_regions = add_regions(i, accessible_regions)

Expand Down

0 comments on commit 263dcc6

Please sign in to comment.