Feat/macs3/fragsupport #690

Merged
merged 2 commits on Feb 18, 2025
19 changes: 13 additions & 6 deletions ChangeLog
@@ -1,17 +1,24 @@
2025-02-12 Tao Liu <[email protected]>
2025-02-14 Tao Liu <[email protected]>
MACS 3.0.3

* Features added

1) We implemented the IO module for reading the fragment files
usually used in single-cell ATAC-seq experiment
`Parser.FragParser`. And we implemented a new
1) FRAG format introduced. FRAG format is for fragment files
defined by 10x genomics to store the alignments from single-cell
ATAC-seq experiment. It can be regarded as the BEDPE format with
two extra columns -- the barcode information and the counts of the
fragments aligned to the same location with the same
barcode. Currently, `callpeak` and `pileup` both support the new
format. We will add support for other functions such as `hmmratac`
in the future.

We implemented the IO module for reading the fragment files in
`Parser.FragParser`. And we then implemented a new
`PairedEndTrack.PETrackII` to store the data in fragment file,
including the barcodes and counts information. In the `PETrackII`
class, we are able to extract a subset using a list of barcodes,
which enables us to call peaks only on a pool (pseudo-bulk) of
cells. Functions such as `callpeak`, `pileup` and `hmmratac` will
support this FRAG format in the future.
cells.

2) We extensively rewrote the `pyx` codes into `py` codes. In
other words, we now apply the 'pure python style' with PEP-484
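To make the new FRAG format concrete, here is a minimal sketch of a single record, assuming the five tab-separated columns described in the ChangeLog entry above (chromosome, start, end, cell barcode, count); the coordinates and barcode value are invented for illustration.

# One FRAG record: BEDPE-style interval plus barcode and duplicate count.
frag_line = b"chr1\t10060\t10298\tACGTACGTACGTACGT-1\t2\n"

chrom, start, end, barcode, count = frag_line.rstrip().split(b"\t")
start, end, count = int(start), int(end), int(count)
# chrom/start/end describe the fragment interval (as in BEDPE),
# barcode identifies the cell, and count is how many identical
# fragments were observed for that barcode at this location.
print(chrom, start, end, barcode, count)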
36 changes: 30 additions & 6 deletions MACS3/Commands/callpeak_cmd.py
@@ -1,4 +1,4 @@
# Time-stamp: <2024-11-29 21:43:31 Tao Liu>
# Time-stamp: <2025-02-15 14:31:26 Tao Liu>

"""Description: MACS 3 call peak main executable

@@ -82,6 +82,19 @@ def run(args):
tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
info("#1 total %ss in treatment: %d", tag, t0)

# subset of fragments for FRAG format and given barcodes
if options.format == "FRAG" and options.barcodefile:
info("#1 extract fragments with given barcodes")
barcodes_set = set()
with open(options.barcodefile, "r") as bfhd:
for l in bfhd:
barcodes_set.add(l.rstrip().encode())
treat = treat.subset(barcodes_set)
info("# extracted %d fragments in treatment", treat.total)
if control is not None:
control = control.subset(barcodes_set)
info("# extracted %d fragments in control", control.total)

# handle duplicates
if options.keepduplicates != "all":
if options.keepduplicates == "auto":
@@ -332,30 +345,41 @@ def load_frag_files_options(options):
options.info("#1 read treatment fragments...")

tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
treat = tp.build_petrack()
if options.format == "FRAG" and options.maxcount:
treat = tp.build_petrack(max_count=options.maxcount)
else:
treat = tp.build_petrack()
if len(options.tfile) > 1:
# multiple input
for tfile in options.tfile[1:]:
tp = options.parser(tfile, buffer_size=options.buffer_size)
treat = tp.append_petrack(treat)
if options.format == "FRAG" and options.maxcount:
treat = tp.append_petrack(treat, max_count=options.maxcount)
else:
treat = tp.append_petrack(treat)
treat.finalize()

options.tsize = tp.d
if options.cfile:
options.info("#1.2 read input fragments...")
cp = options.parser(options.cfile[0], buffer_size=options.buffer_size)
control = cp.build_petrack()
if options.format == "FRAG" and options.maxcount:
control = cp.build_petrack(max_count=options.maxcount)
else:
control = cp.build_petrack()
control_d = cp.d
if len(options.cfile) > 1:
# multiple input
for cfile in options.cfile[1:]:
cp = options.parser(cfile, buffer_size=options.buffer_size)
control = cp.append_petrack(control)
if options.format == "FRAG" and options.maxcount:
control = cp.append_petrack(control, max_count=options.maxcount)
else:
control = cp.append_petrack(control)
control.finalize()
else:
control = None
options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
# options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
if control is not None:
options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
return (treat, control)
Expand Down
27 changes: 22 additions & 5 deletions MACS3/Commands/pileup_cmd.py
@@ -4,7 +4,7 @@
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# Time-stamp: <2025-02-05 12:39:14 Tao Liu>
# Time-stamp: <2025-02-15 14:33:43 Tao Liu>

# ------------------------------------
# python modules
@@ -52,6 +52,17 @@ def run(o_options):
treat = load_frag_files_options(options) # return PETrackI object
t0 = treat.total # total fragments
info("# total fragments/pairs in alignment file: %d" % (t0))

# subset of fragments for FRAG format and given barcodes
if options.format == "FRAG" and options.barcodefile:
info("# extract fragments with given barcodes")
barcodes_set = set()
with open(options.barcodefile, "r") as bfhd:
for l in bfhd:
barcodes_set.add(l.rstrip().encode())
treat = treat.subset(barcodes_set)
info("# extracted %d fragments", treat.total)

info("# Pileup paired-end alignment file.")
bdg = treat.pileup_bdg()
bdgio = bedGraphIO(outfile, data=bdg)
@@ -78,7 +89,7 @@ def load_tag_files_options(options):
"""From the options, load alignment tags.

"""
options.info("# read treatment tags...")
options.info("# read tags...")
tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
tsize = tp.tsize()
treat = tp.build_fwtrack()
@@ -99,16 +110,22 @@ def load_frag_files_options(options):
"""From the options, load treatment fragments and control fragments (if available).

"""
options.info("# read treatment fragments...")
options.info("# read fragments...")

tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
treat = tp.build_petrack()
if options.format == "FRAG" and options.maxcount:
treat = tp.build_petrack(max_count=options.maxcount)
else:
treat = tp.build_petrack()
# treat.sort()
if len(options.ifile) > 1:
# multiple input
for tfile in options.ifile[1:]:
tp = options.parser(tfile, buffer_size=options.buffer_size)
treat = tp.append_petrack(treat)
if options.format == "FRAG" and options.maxcount:
treat = tp.append_petrack(treat, max_count=options.maxcount)
else:
treat = tp.append_petrack(treat)
# treat.sort()
treat.finalize()
return treat
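The max_count argument threaded through build_petrack()/append_petrack() here ends up clamping each record's duplicate count inside the parser (see the min(count, max_count) lines in the Parser.py hunks below). A toy illustration of that clamping, with made-up numbers:

raw_counts = [1, 3, 7, 2]   # counts column from a FRAG file
max_count = 1               # e.g. treat every fragment as present/absent

# When max_count is non-zero, each count is capped before being added to the track.
capped = [min(c, max_count) if max_count else c for c in raw_counts]
print(capped)               # -> [1, 1, 1, 1]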
20 changes: 12 additions & 8 deletions MACS3/IO/Parser.py
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2024-11-29 23:30:03 Tao Liu>
# Time-stamp: <2025-02-15 08:24:53 Tao Liu>

"""Module for all MACS Parser classes for input. Please note that the
parsers are for reading the alignment files ONLY.
@@ -249,7 +249,7 @@ def __init__(self, filename: str, buffer_size: cython.long = 100000):
self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'),
buffer_size=READ_BUFFER_SIZE)
else:
# binary mode! I don't expect unicode here!
# binary mode! I don't expect unicode here!
self.fhd = io.open(filename, mode='rb')
self.skip_first_commentlines()

@@ -1529,11 +1529,7 @@ def pe_parse_line(self, thisline: bytes):
thisline)

@cython.ccall
def build_petrack(self):
return self.build_petrack2()

@cython.ccall
def build_petrack2(self):
def build_petrack(self, max_count=0):
"""Build PETrackII from all lines.

"""
@@ -1565,6 +1561,8 @@ def build_petrack2(self):
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
@@ -1573,6 +1571,8 @@ def build_petrack2(self):
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = cython.cast(cython.float, m) / i
Expand All @@ -1584,7 +1584,7 @@ def build_petrack2(self):
return petrack

@cython.ccall
def append_petrack(self, petrack):
def append_petrack(self, petrack, max_count=0):
"""Build PETrackI from all lines, return a PETrackI object.
"""
chromosome: bytes
@@ -1613,6 +1613,8 @@ def append_petrack(self, petrack):
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
@@ -1621,6 +1623,8 @@ def append_petrack(self, petrack):
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = (self.d * self.n + m) / (self.n + i)
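Taken together, the parser changes above suggest the following rough usage pattern. This is a sketch only: it assumes FragParser (named in the ChangeLog) accepts the same filename/buffer_size arguments as the other parsers in this module, and the file names are hypothetical.

from MACS3.IO.Parser import FragParser

parser = FragParser("fragments.tsv.gz", buffer_size=100000)  # hypothetical file name
petrack = parser.build_petrack(max_count=1)   # clamp each record's count at 1

# Additional fragment files could be merged into the same track before finalizing:
# parser2 = FragParser("fragments_rep2.tsv.gz", buffer_size=100000)
# petrack = parser2.append_petrack(petrack, max_count=1)

petrack.finalize()  # as done in callpeak_cmd/pileup_cmd after all inputs are read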
86 changes: 23 additions & 63 deletions MACS3/Signal/CallPeakUnit.py
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2025-02-05 12:40:36 Tao Liu>
# Time-stamp: <2025-02-14 12:08:53 Tao Liu>

"""Module for Calculate Scores.

@@ -622,9 +622,9 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:

return [p, t, c] list, each element is a numpy array.
"""
index_ret: cython.long
it: cython.long
ic: cython.long
ir: cython.long # index of ret_p/t/c
it: cython.long # index of t_p/v
ic: cython.long # index of c_p/v
lt: cython.long
lc: cython.long
t_p: cnp.ndarray
@@ -634,13 +634,6 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:
c_v: cnp.ndarray
ret_t: cnp.ndarray
ret_c: cnp.ndarray
t_p_view: cython.pointer(cython.int)
c_p_view: cython.pointer(cython.int)
ret_p_view: cython.pointer(cython.int)
t_v_view: cython.pointer(cython.float)
c_v_view: cython.pointer(cython.float)
ret_t_view: cython.pointer(cython.float)
ret_c_view: cython.pointer(cython.float)

[t_p, t_v] = treat_pv
[c_p, c_v] = ctrl_pv
@@ -654,73 +647,40 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:
ret_t = np.zeros(chrom_max_len, dtype="f4") # value from treatment
ret_c = np.zeros(chrom_max_len, dtype="f4") # value from control

# t_p_view = t_p #cython.cast(cython.pointer[cython.int], t_p.data)
# t_v_view = t_v #cython.cast(cython.pointer[cython.float], t_v.data)
# c_p_view = c_p #cython.cast(cython.pointer[cython.int], c_p.data)
# c_v_view = c_v #cython.cast(cython.pointer[cython.float], c_v.data)
# ret_p_view = ret_p #cython.cast(cython.pointer[cython.int], ret_p.data)
# ret_t_view = ret_t #cython.cast(cython.pointer[cython.float], ret_t.data)
# ret_c_view = ret_c #cython.cast(cython.pointer[cython.float], ret_c.data)

t_p_view = cython.cast(cython.pointer(cython.int), t_p.data)
t_v_view = cython.cast(cython.pointer(cython.float), t_v.data)
c_p_view = cython.cast(cython.pointer(cython.int), c_p.data)
c_v_view = cython.cast(cython.pointer(cython.float), c_v.data)
ret_p_view = cython.cast(cython.pointer(cython.int), ret_p.data)
ret_t_view = cython.cast(cython.pointer(cython.float), ret_t.data)
ret_c_view = cython.cast(cython.pointer(cython.float), ret_c.data)

index_ret = 0
ir = 0
it = 0
ic = 0

while it < lt and ic < lc:
if t_p_view[0] < c_p_view[0]:
if t_p[it] < c_p[ic]:
# clip a region from pre_p to p1, then pre_p: set as p1.
ret_p_view[0] = t_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = t_p[it]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p1 and v1
it += 1
t_p_view += 1
t_v_view += 1
elif t_p_view[0] > c_p_view[0]:
elif t_p[it] > c_p[ic]:
# clip a region from pre_p to p2, then pre_p: set as p2.
ret_p_view[0] = c_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = c_p[ic]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p2 and v2
ic += 1
c_p_view += 1
c_v_view += 1
else:
# from pre_p to p1 or p2, then pre_p: set as p1 or p2.
ret_p_view[0] = t_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = t_p[it]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p1, v1, p2, v2.
it += 1
ic += 1
t_p_view += 1
t_v_view += 1
c_p_view += 1
c_v_view += 1

ret_p.resize(index_ret, refcheck=False)
ret_t.resize(index_ret, refcheck=False)
ret_c.resize(index_ret, refcheck=False)

ret_p.resize(ir, refcheck=False)
ret_t.resize(ir, refcheck=False)
ret_c.resize(ir, refcheck=False)
return [ret_p, ret_t, ret_c]

@cython.cfunc
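The rewrite above drops the raw pointer views in favor of plain indexed access into the numpy arrays. For readers who want to follow the algorithm outside the Cython context, here is a standalone, simplified sketch of the same two-index pairing logic (no Cython typing, Python lists instead of preallocated arrays).

import numpy as np

def pair_treat_ctrl(t_p, t_v, c_p, c_v):
    """Pair treatment and control (position, value) arrays at every breakpoint,
    mirroring the indexed-access merge in __chrom_pair_treat_ctrl."""
    ret_p, ret_t, ret_c = [], [], []
    it = ic = 0
    while it < len(t_p) and ic < len(c_p):
        if t_p[it] < c_p[ic]:
            # treatment breakpoint comes first
            ret_p.append(t_p[it]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            it += 1
        elif t_p[it] > c_p[ic]:
            # control breakpoint comes first
            ret_p.append(c_p[ic]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            ic += 1
        else:
            # both intervals end at the same position
            ret_p.append(t_p[it]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            it += 1
            ic += 1
    return [np.asarray(ret_p, dtype="i4"),
            np.asarray(ret_t, dtype="f4"),
            np.asarray(ret_c, dtype="f4")]

# Example: positions are running end coordinates, values are pileup heights.
# pair_treat_ctrl(np.array([10, 20, 30]), np.array([1.0, 2.0, 1.0]),
#                 np.array([15, 30]),     np.array([0.5, 0.5]))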