Commit

Merge branch 'master' into dedup_update
golobor authored Mar 9, 2024
2 parents 08d3a90 + f6574dd commit 2fb3690
Showing 12 changed files with 83 additions and 52 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/python-package.yml
@@ -3,8 +3,13 @@

name: Python package

on: push

on:
push:
branches: [ master ]
tags:
- "v*" # Tag events matching v*, i.e. v1.0, v20.15.10
pull_request:
branches: [ master ]
jobs:
build:

3 changes: 3 additions & 0 deletions CHANGES.md
@@ -1,3 +1,6 @@
### 1.0.3 (2023-11-20) ###
- [x] `pairtools dedup`: update default chunksize to 10,000 to prevent memory overflow on datasets with high duplication rate

### 1.0.2 (2022-11-XX) ###

- [x] `pairtools select` regex update
5 changes: 2 additions & 3 deletions doc/conf.py
@@ -19,7 +19,7 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))

# -- General configuration ------------------------------------------------

@@ -38,7 +38,6 @@
"sphinx.ext.napoleon",
"sphinx.ext.mathjax",
"sphinx_click.ext",
"recommonmark",
"nbsphinx",
"sphinx_rtd_theme",
]
@@ -100,7 +99,7 @@ def get_version():
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = 'en'

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
4 changes: 2 additions & 2 deletions pairtools/__init__.py
@@ -4,12 +4,12 @@
CLI tools to process mapped Hi-C data
:copyright: (c) 2017-2022 Open2C
:copyright: (c) 2017-2023 Open2C
:author: Open2C
:license: MIT
"""

__version__ = "1.0.2"
__version__ = "1.0.3"

# from . import lib
2 changes: 1 addition & 1 deletion pairtools/cli/dedup.py
@@ -113,7 +113,7 @@
@click.option(
"--chunksize",
type=int,
default=100_000,
default=10_000,
show_default=True,
help="Number of pairs in each chunk. Reduce for lower memory footprint."
" Below 10,000 performance starts suffering significantly and the algorithm might"
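For context: the `--chunksize` option shown above stays overridable per run, so the smaller 10,000-pair default does not cap what users can request. Below is a minimal sketch (hypothetical input/output paths, not part of this commit) that mirrors how the test suite further down drives the CLI through subprocess:

```python
import subprocess

# Hypothetical paths; "--chunksize" overrides the new 10,000-pair default,
# trading a larger in-memory chunk for fewer chunk boundaries.
subprocess.check_output(
    [
        "python", "-m", "pairtools", "dedup",
        "input.sorted.pairsam",         # hypothetical input
        "--output", "deduped.pairsam",  # hypothetical output
        "--chunksize", "50000",
    ]
)
```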
1 change: 0 additions & 1 deletion pairtools/lib/__init__.py
@@ -1,6 +1,5 @@
from . import fileio
from . import dedup
from . import dedup_cython
from . import filterbycov
from . import headerops
from . import pairsam_format
23 changes: 12 additions & 11 deletions pairtools/lib/dedup.py
@@ -4,6 +4,7 @@
import scipy.spatial
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from csv import QUOTE_NONE

from . import dedup_cython, pairsam_format
from .stats import PairCounter
@@ -15,7 +16,8 @@

# Ignore pandas future warnings:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

warnings.simplefilter(action="ignore", category=FutureWarning)

# Setting for cython deduplication:
# you don't need to load more than 10k lines at a time b/c you get out of the
@@ -92,7 +94,6 @@ def streaming_dedup(
-------
"""

deduped_chunks = _dedup_stream(
in_stream=in_stream,
colnames=colnames,
@@ -131,7 +132,7 @@ def streaming_dedup(
# Stream the dups:
if outstream_dups:
df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv(
outstream_dups, index=False, header=False, sep="\t"
outstream_dups, index=False, header=False, sep="\t", quoting=QUOTE_NONE
)

# Drop readID if it was created (not needed for nodup and unmapped pairs):
@@ -141,12 +142,16 @@
# Stream unmapped:
if outstream_unmapped:
df_chunk.loc[~mask_mapped, :].to_csv(
outstream_unmapped, index=False, header=False, sep="\t"
outstream_unmapped,
index=False,
header=False,
sep="\t",
quoting=QUOTE_NONE,
)

# Stream unique pairs:
df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
outstream, index=False, header=False, sep="\t"
outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
)

t1 = time.time()
@@ -173,9 +178,7 @@ def _dedup_stream(
count_dups=False,
):
# Stream the input dataframe:
dfs = pd.read_table(
in_stream, comment=None, names=colnames, chunksize=chunksize
)
dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)

# Set up the carryover dataframe:
df_prev_nodups = pd.DataFrame([])
@@ -445,7 +448,7 @@ def streaming_dedup_cython(
read_idx = 0 # read index to mark the parent readID
while True:
rawline = next(instream, None)
stripline = rawline.strip('\n') if rawline else None
stripline = rawline.strip("\n") if rawline else None

# take care of empty lines not at the end of the file separately
if rawline and (not stripline):
@@ -461,7 +464,6 @@
)

if (cols[c1ind] == unmapped_chrom) or (cols[c2ind] == unmapped_chrom):

if outstream_unmapped:
outstream_unmapped.write(stripline)
# don't forget terminal newline
@@ -627,7 +629,6 @@ def mark_split_pair_as_dup(cols):

if (len(cols) > pairsam_format.COL_SAM1) and (len(cols) > pairsam_format.COL_SAM2):
for i in (pairsam_format.COL_SAM1, pairsam_format.COL_SAM2):

# split each sam column into sam entries, tag and assemble back
cols[i] = pairsam_format.INTER_SAM_SEP.join(
[
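The `quoting=QUOTE_NONE` arguments added above make pandas write each chunk verbatim. With the default `QUOTE_MINIMAL`, any field that happens to contain a `"` character (for example inside a stored SAM tag) gets wrapped in quotes with inner quotes doubled, which would corrupt the tab-separated `.pairsam` stream. A minimal sketch with toy data, not taken from the repository:

```python
import io
from csv import QUOTE_NONE

import pandas as pd

# Toy stand-in for one chunk of pairs; the sam field contains a quote character.
df = pd.DataFrame({"readID": ["r1"], "sam1": ['AS:i:0 XS:Z:"q"']})

quoted, unquoted = io.StringIO(), io.StringIO()
df.to_csv(quoted, index=False, header=False, sep="\t")
df.to_csv(unquoted, index=False, header=False, sep="\t", quoting=QUOTE_NONE)

print(repr(quoted.getvalue()))    # 'r1\t"AS:i:0 XS:Z:""q"""\n' -- field re-quoted
print(repr(unquoted.getvalue()))  # 'r1\tAS:i:0 XS:Z:"q"\n'     -- written as-is
```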
38 changes: 15 additions & 23 deletions pairtools/lib/stats.py
@@ -176,17 +176,13 @@ def __getitem__(self, key, filter="no_filter"):
# there is only genomic distance range of the bin that's left:
(bin_range,) = k_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1
)
# store corresponding value:
return self._stat[filter]["dist_freq"][dirs][bin_idx]
return self._stat[filter]["dist_freq"][dirs][dist_bin_left]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
@@ -337,20 +333,13 @@ def from_file(cls, file_handle):
# there is only genomic distance range of the bin that's left:
(bin_range,) = key_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(
stat_from_file._dist_bins, int(dist_bin_left), "right"
)
- 1
)
# store corresponding value:
stat_from_file._stat[default_filter][key][dirs][bin_idx] = int(
stat_from_file._stat[default_filter][key][dirs][dist_bin_left] = int(
fields[1]
)
else:
@@ -380,9 +369,11 @@ def from_yaml(cls, file_handle):
new PairCounter filled with the contents of the input file
"""
# fill in from file - file_handle:
stat_from_file = cls()

stat = yaml.safe_load(file_handle)
stat_from_file = cls(
filters={key: val.get("filter_expression", "") for key, val in stat.items()}
)

for key, filter in stat.items():
chromdict = {}
for chroms in stat[key]["chrom_freq"].keys():
@@ -444,10 +435,10 @@ def add_pair(
if chrom1 == chrom2:
self._stat[filter]["cis"] += 1
dist = np.abs(pos2 - pos1)
bin = self._dist_bins[
dist_bin = self._dist_bins[
np.searchsorted(self._dist_bins, dist, "right") - 1
]
self._stat[filter]["dist_freq"][strand1 + strand2][bin] += 1
self._stat[filter]["dist_freq"][strand1 + strand2][dist_bin] += 1
if dist >= 1000:
self._stat[filter]["cis_1kb+"] += 1
if dist >= 2000:
@@ -637,7 +628,6 @@ def __add__(self, other, filter="no_filter"):
].get(union_key, 0) + other._stat[filter][k].get(union_key, 0)
elif k == "dist_freq":
for dirs in sum_stat[k]:

from functools import reduce

def reducer(accumulator, element):
@@ -701,17 +691,19 @@ def flatten(self, filter="no_filter"):
if (k == "dist_freq") and v:
for i in range(len(self._dist_bins)):
for dirs, freqs in v.items():
dist = self._dist_bins[i]
# last bin is treated differently: "100000+" vs "1200-3000":
if i != len(self._dist_bins) - 1:
dist = self._dist_bins[i]
if i < len(self._dist_bins) - 1:
dist_next = self._dist_bins[i + 1]
formatted_key = self._KEY_SEP.join(
["{}", "{}-{}", "{}"]
).format(k, dist, dist_next, dirs)
else:
elif i == len(self._dist_bins) - 1:
formatted_key = self._KEY_SEP.join(
["{}", "{}+", "{}"]
).format(k, dist, dirs)
else:
raise ValueError("There is a mismatch between dist_freq bins in the instance")
# store key,value pair:
flat_stat[formatted_key] = freqs[dist]
elif (k in ["pair_types", "dedup", "chromsizes"]) and v:
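The `stats.py` hunks above drop the extra translation of a distance bin's left edge into a positional index via `np.searchsorted` and instead key the `dist_freq` counters directly by that left edge, which is also what the flattened keys such as `dist_freq/1500-6000/++` encode. A rough sketch of the lookup, using made-up bin edges rather than the library's actual `_dist_bins`:

```python
import numpy as np

# Made-up distance bin edges; PairCounter builds its own _dist_bins internally.
dist_bins = np.array([0, 1000, 2000, 4000, 10_000, 100_000, 1_000_000])

def left_edge(dist):
    # Left edge of the bin containing `dist`; after this change that edge is
    # used directly as the dist_freq key rather than converted to a bin index.
    return int(dist_bins[np.searchsorted(dist_bins, dist, "right") - 1])

dist_freq = {"++": {int(edge): 0 for edge in dist_bins}}
dist_freq["++"][left_edge(3_500)] += 1    # lands in the 2000-4000 bin
dist_freq["++"][left_edge(250_000)] += 1  # lands in the 100000+ bin

# Flattened, these become keys like "dist_freq/2000-4000/++" and "dist_freq/100000+/++".
```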
17 changes: 11 additions & 6 deletions readthedocs.yml
@@ -1,12 +1,17 @@

# .readthedocs.yml
version: 2

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.10"

python:
version: 3.8
pip_install: true

requirements_file: requirements_doc.txt
sphinx:
configuration: doc/conf.py

python:
install:
- requirements: requirements_doc.txt
- method: pip
path: .
7 changes: 4 additions & 3 deletions requirements_doc.txt
@@ -6,9 +6,10 @@ pandas
pysam
bioframe
click>=7.0
git+https://github.com/golobor/sphinx-click
sphinx-click
ipython
nbsphinx
Sphinx
Sphinx>=7.0
sphinx_rtd_theme
docutils>0.16
docutils>0.16
-e .
6 changes: 6 additions & 0 deletions tests/data/mock_empty.4dedup.pairsam
@@ -0,0 +1,6 @@
## pairs format v1.0.0
#sorted: chr1-chr2-pos1-pos2
#shape: upper triangle
#genome_assembly: unknown
#chromosomes: chr1 chr2
#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type sam1 sam2
20 changes: 20 additions & 0 deletions tests/test_dedup.py
@@ -26,6 +26,9 @@

max_mismatch = 1

mock_empty_pairsam_path_dedup = os.path.join(testdir, "data", "mock_empty.4dedup.pairsam")
empty_dedup_path = os.path.join(tmpdir_name, "empty_dedup.pairsam")


@pytest.fixture
def setup_dedup():
@@ -84,6 +87,17 @@ def setup_dedup():
str(max_mismatch),
],
)
subprocess.check_output(
[
"python",
"-m",
"pairtools",
"dedup",
mock_empty_pairsam_path_dedup,
"--output",
empty_dedup_path,
],
)
except subprocess.CalledProcessError as e:
print(e.output)
print(sys.exc_info())
@@ -160,5 +174,11 @@ def pairs_overlap(pair1, pair2, max_mismatch):
for pair1 in dup_pairs
]
)
empty_dedup_pairs = [
l.strip().split("\t")
for l in open(empty_dedup_path, "r")
if not l.startswith("#") and l.strip()
]
assert len(empty_dedup_pairs) == 0

tmpdir.cleanup()
