Feat/macs3/fragsupport #690

Merged
merged 2 commits on Feb 18, 2025
19 changes: 13 additions & 6 deletions ChangeLog
@@ -1,17 +1,24 @@
2025-02-12 Tao Liu <[email protected]>
2025-02-14 Tao Liu <[email protected]>
MACS 3.0.3

* Features added

1) We implemented the IO module for reading the fragment files
usually used in single-cell ATAC-seq experiment
`Parser.FragParser`. And we implemented a new
1) FRAG format introduced. FRAG format is for fragment files
defined by 10x genomics to store the alignments from single-cell
ATAC-seq experiment. It can be regarded as the BEDPE format with
two extra columns -- the barcode information and the counts of the
fragments aligned to the same location with the same
barcode. Currently, `callpeak` and `pileup` both support the new
format. We will add support for other functions such as `hmmratac`
in the future.

We implemented the IO module for reading the fragment files in
`Parser.FragParser`. And we then implemented a new
`PairedEndTrack.PETrackII` to store the data in fragment file,
including the barcodes and counts information. In the `PETrackII`
class, we are able to extract a subset using a list of barcodes,
which enables us to call peaks only on a pool (pseudo-bulk) of
cells. Functions such as `callpeak`, `pileup` and `hmmratac` will
support this FRAG format in the future.
cells.

2) We extensively rewrote the `pyx` codes into `py` codes. In
other words, we now apply the 'pure python style' with PEP-484
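To make the new FRAG format concrete, here is a minimal sketch of a single record, assuming the five tab-separated columns described in the ChangeLog entry above (chromosome, start, end, cell barcode, count); the coordinates and barcode value are invented for illustration.

# One FRAG record: BEDPE-style interval plus barcode and duplicate count.
frag_line = b"chr1\t10060\t10298\tACGTACGTACGTACGT-1\t2\n"

chrom, start, end, barcode, count = frag_line.rstrip().split(b"\t")
start, end, count = int(start), int(end), int(count)
# chrom/start/end describe the fragment interval (as in BEDPE),
# barcode identifies the cell, and count is how many identical
# fragments were observed for that barcode at this location.
print(chrom, start, end, barcode, count)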
36 changes: 30 additions & 6 deletions MACS3/Commands/callpeak_cmd.py
@@ -1,4 +1,4 @@
# Time-stamp: <2024-11-29 21:43:31 Tao Liu>
# Time-stamp: <2025-02-15 14:31:26 Tao Liu>

"""Description: MACS 3 call peak main executable

@@ -82,6 +82,19 @@ def run(args):
tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
info("#1 total %ss in treatment: %d", tag, t0)

# subset of fragments for FRAG format and given barcodes
if options.format == "FRAG" and options.barcodefile:
info("#1 extract fragments with given barcodes")
barcodes_set = set()
with open(options.barcodefile, "r") as bfhd:
for l in bfhd:
barcodes_set.add(l.rstrip().encode())
treat = treat.subset(barcodes_set)
info("# extracted %d fragments in treatment", treat.total)
if control is not None:
control = control.subset(barcodes_set)
info("# extracted %d fragments in control", control.total)

# handle duplicates
if options.keepduplicates != "all":
if options.keepduplicates == "auto":
@@ -332,30 +345,41 @@ def load_frag_files_options(options):
options.info("#1 read treatment fragments...")

tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
treat = tp.build_petrack()
if options.format == "FRAG" and options.maxcount:
treat = tp.build_petrack(max_count=options.maxcount)
else:
treat = tp.build_petrack()
if len(options.tfile) > 1:
# multiple input
for tfile in options.tfile[1:]:
tp = options.parser(tfile, buffer_size=options.buffer_size)
treat = tp.append_petrack(treat)
if options.format == "FRAG" and options.maxcount:
treat = tp.append_petrack(treat, max_count=options.maxcount)
else:
treat = tp.append_petrack(treat)
treat.finalize()

options.tsize = tp.d
if options.cfile:
options.info("#1.2 read input fragments...")
cp = options.parser(options.cfile[0], buffer_size=options.buffer_size)
control = cp.build_petrack()
if options.format == "FRAG" and options.maxcount:
control = cp.build_petrack(max_count=options.maxcount)
else:
control = cp.build_petrack()
control_d = cp.d
if len(options.cfile) > 1:
# multiple input
for cfile in options.cfile[1:]:
cp = options.parser(cfile, buffer_size=options.buffer_size)
control = cp.append_petrack(control)
if options.format == "FRAG" and options.maxcount:
control = cp.append_petrack(control, max_count=options.maxcount)
else:
control = cp.append_petrack(control)
control.finalize()
else:
control = None
options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
# options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
if control is not None:
options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
return (treat, control)
Expand Down
27 changes: 22 additions & 5 deletions MACS3/Commands/pileup_cmd.py
@@ -4,7 +4,7 @@
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# Time-stamp: <2025-02-05 12:39:14 Tao Liu>
# Time-stamp: <2025-02-15 14:33:43 Tao Liu>

# ------------------------------------
# python modules
@@ -52,6 +52,17 @@ def run(o_options):
treat = load_frag_files_options(options) # return PETrackI object
t0 = treat.total # total fragments
info("# total fragments/pairs in alignment file: %d" % (t0))

# subset of fragments for FRAG format and given barcodes
if options.format == "FRAG" and options.barcodefile:
info("# extract fragments with given barcodes")
barcodes_set = set()
with open(options.barcodefile, "r") as bfhd:
for l in bfhd:
barcodes_set.add(l.rstrip().encode())
treat = treat.subset(barcodes_set)
info("# extracted %d fragments", treat.total)

info("# Pileup paired-end alignment file.")
bdg = treat.pileup_bdg()
bdgio = bedGraphIO(outfile, data=bdg)
@@ -78,7 +89,7 @@ def load_tag_files_options(options):
"""From the options, load alignment tags.

"""
options.info("# read treatment tags...")
options.info("# read tags...")
tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
tsize = tp.tsize()
treat = tp.build_fwtrack()
@@ -99,16 +110,22 @@ def load_frag_files_options(options):
"""From the options, load treatment fragments and control fragments (if available).

"""
options.info("# read treatment fragments...")
options.info("# read fragments...")

tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
treat = tp.build_petrack()
if options.format == "FRAG" and options.maxcount:
treat = tp.build_petrack(max_count=options.maxcount)
else:
treat = tp.build_petrack()
# treat.sort()
if len(options.ifile) > 1:
# multiple input
for tfile in options.ifile[1:]:
tp = options.parser(tfile, buffer_size=options.buffer_size)
treat = tp.append_petrack(treat)
if options.format == "FRAG" and options.maxcount:
treat = tp.append_petrack(treat, max_count=options.maxcount)
else:
treat = tp.append_petrack(treat)
# treat.sort()
treat.finalize()
return treat
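The max_count argument threaded through build_petrack()/append_petrack() here ends up clamping each record's duplicate count inside the parser (see the min(count, max_count) lines in the Parser.py hunks below). A toy illustration of that clamping, with made-up numbers:

raw_counts = [1, 3, 7, 2]   # counts column from a FRAG file
max_count = 1               # e.g. treat every fragment as present/absent

# When max_count is non-zero, each count is capped before being added to the track.
capped = [min(c, max_count) if max_count else c for c in raw_counts]
print(capped)               # -> [1, 1, 1, 1]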
20 changes: 12 additions & 8 deletions MACS3/IO/Parser.py
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2024-11-29 23:30:03 Tao Liu>
# Time-stamp: <2025-02-15 08:24:53 Tao Liu>

"""Module for all MACS Parser classes for input. Please note that the
parsers are for reading the alignment files ONLY.
@@ -249,7 +249,7 @@ def __init__(self, filename: str, buffer_size: cython.long = 100000):
self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'),
buffer_size=READ_BUFFER_SIZE)
else:
# binary mode! I don't expect unicode here!
# binary mode! I don't expect unicode here!
self.fhd = io.open(filename, mode='rb')
self.skip_first_commentlines()

@@ -1529,11 +1529,7 @@ def pe_parse_line(self, thisline: bytes):
thisline)

@cython.ccall
def build_petrack(self):
return self.build_petrack2()

@cython.ccall
def build_petrack2(self):
def build_petrack(self, max_count=0):
"""Build PETrackII from all lines.

"""
@@ -1565,6 +1561,8 @@ def build_petrack2(self):
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
@@ -1573,6 +1571,8 @@ def build_petrack2(self):
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = cython.cast(cython.float, m) / i
Expand All @@ -1584,7 +1584,7 @@ def build_petrack2(self):
return petrack

@cython.ccall
def append_petrack(self, petrack):
def append_petrack(self, petrack, max_count=0):
"""Build PETrackI from all lines, return a PETrackI object.
"""
chromosome: bytes
@@ -1613,6 +1613,8 @@ def append_petrack(self, petrack):
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
@@ -1621,6 +1623,8 @@ def append_petrack(self, petrack):
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
if max_count:
count = min(count, max_count)
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = (self.d * self.n + m) / (self.n + i)
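Taken together, the parser changes above suggest the following rough usage pattern. This is a sketch only: it assumes FragParser (named in the ChangeLog) accepts the same filename/buffer_size arguments as the other parsers in this module, and the file names are hypothetical.

from MACS3.IO.Parser import FragParser

parser = FragParser("fragments.tsv.gz", buffer_size=100000)  # hypothetical file name
petrack = parser.build_petrack(max_count=1)   # clamp each record's count at 1

# Additional fragment files could be merged into the same track before finalizing:
# parser2 = FragParser("fragments_rep2.tsv.gz", buffer_size=100000)
# petrack = parser2.append_petrack(petrack, max_count=1)

petrack.finalize()  # as done in callpeak_cmd/pileup_cmd after all inputs are read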
86 changes: 23 additions & 63 deletions MACS3/Signal/CallPeakUnit.py
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2025-02-05 12:40:36 Tao Liu>
# Time-stamp: <2025-02-14 12:08:53 Tao Liu>

"""Module for Calculate Scores.

@@ -622,9 +622,9 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:

return [p, t, c] list, each element is a numpy array.
"""
index_ret: cython.long
it: cython.long
ic: cython.long
ir: cython.long # index of ret_p/t/c
it: cython.long # index of t_p/v
ic: cython.long # index of c_p/v
lt: cython.long
lc: cython.long
t_p: cnp.ndarray
@@ -634,13 +634,6 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:
c_v: cnp.ndarray
ret_t: cnp.ndarray
ret_c: cnp.ndarray
t_p_view: cython.pointer(cython.int)
c_p_view: cython.pointer(cython.int)
ret_p_view: cython.pointer(cython.int)
t_v_view: cython.pointer(cython.float)
c_v_view: cython.pointer(cython.float)
ret_t_view: cython.pointer(cython.float)
ret_c_view: cython.pointer(cython.float)

[t_p, t_v] = treat_pv
[c_p, c_v] = ctrl_pv
@@ -654,73 +647,40 @@ def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:
ret_t = np.zeros(chrom_max_len, dtype="f4") # value from treatment
ret_c = np.zeros(chrom_max_len, dtype="f4") # value from control

# t_p_view = t_p #cython.cast(cython.pointer[cython.int], t_p.data)
# t_v_view = t_v #cython.cast(cython.pointer[cython.float], t_v.data)
# c_p_view = c_p #cython.cast(cython.pointer[cython.int], c_p.data)
# c_v_view = c_v #cython.cast(cython.pointer[cython.float], c_v.data)
# ret_p_view = ret_p #cython.cast(cython.pointer[cython.int], ret_p.data)
# ret_t_view = ret_t #cython.cast(cython.pointer[cython.float], ret_t.data)
# ret_c_view = ret_c #cython.cast(cython.pointer[cython.float], ret_c.data)

t_p_view = cython.cast(cython.pointer(cython.int), t_p.data)
t_v_view = cython.cast(cython.pointer(cython.float), t_v.data)
c_p_view = cython.cast(cython.pointer(cython.int), c_p.data)
c_v_view = cython.cast(cython.pointer(cython.float), c_v.data)
ret_p_view = cython.cast(cython.pointer(cython.int), ret_p.data)
ret_t_view = cython.cast(cython.pointer(cython.float), ret_t.data)
ret_c_view = cython.cast(cython.pointer(cython.float), ret_c.data)

index_ret = 0
ir = 0
it = 0
ic = 0

while it < lt and ic < lc:
if t_p_view[0] < c_p_view[0]:
if t_p[it] < c_p[ic]:
# clip a region from pre_p to p1, then pre_p: set as p1.
ret_p_view[0] = t_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = t_p[it]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p1 and v1
it += 1
t_p_view += 1
t_v_view += 1
elif t_p_view[0] > c_p_view[0]:
elif t_p[it] > c_p[ic]:
# clip a region from pre_p to p2, then pre_p: set as p2.
ret_p_view[0] = c_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = c_p[ic]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p2 and v2
ic += 1
c_p_view += 1
c_v_view += 1
else:
# from pre_p to p1 or p2, then pre_p: set as p1 or p2.
ret_p_view[0] = t_p_view[0]
ret_t_view[0] = t_v_view[0]
ret_c_view[0] = c_v_view[0]
ret_p_view += 1
ret_t_view += 1
ret_c_view += 1
index_ret += 1
ret_p[ir] = t_p[it]
ret_t[ir] = t_v[it]
ret_c[ir] = c_v[ic]
ir += 1
# call for the next p1, v1, p2, v2.
it += 1
ic += 1
t_p_view += 1
t_v_view += 1
c_p_view += 1
c_v_view += 1

ret_p.resize(index_ret, refcheck=False)
ret_t.resize(index_ret, refcheck=False)
ret_c.resize(index_ret, refcheck=False)

ret_p.resize(ir, refcheck=False)
ret_t.resize(ir, refcheck=False)
ret_c.resize(ir, refcheck=False)
return [ret_p, ret_t, ret_c]

@cython.cfunc
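The rewrite above drops the raw pointer views in favor of plain indexed access into the numpy arrays. For readers who want to follow the algorithm outside the Cython context, here is a standalone, simplified sketch of the same two-index pairing logic (no Cython typing, Python lists instead of preallocated arrays).

import numpy as np

def pair_treat_ctrl(t_p, t_v, c_p, c_v):
    """Pair treatment and control (position, value) arrays at every breakpoint,
    mirroring the indexed-access merge in __chrom_pair_treat_ctrl."""
    ret_p, ret_t, ret_c = [], [], []
    it = ic = 0
    while it < len(t_p) and ic < len(c_p):
        if t_p[it] < c_p[ic]:
            # treatment breakpoint comes first
            ret_p.append(t_p[it]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            it += 1
        elif t_p[it] > c_p[ic]:
            # control breakpoint comes first
            ret_p.append(c_p[ic]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            ic += 1
        else:
            # both intervals end at the same position
            ret_p.append(t_p[it]); ret_t.append(t_v[it]); ret_c.append(c_v[ic])
            it += 1
            ic += 1
    return [np.asarray(ret_p, dtype="i4"),
            np.asarray(ret_t, dtype="f4"),
            np.asarray(ret_c, dtype="f4")]

# Example: positions are running end coordinates, values are pileup heights.
# pair_treat_ctrl(np.array([10, 20, 30]), np.array([1.0, 2.0, 1.0]),
#                 np.array([15, 30]),     np.array([0.5, 0.5]))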