Commit

Merge branch 'master' into dedup_update
golobor authored Mar 9, 2024
2 parents 08d3a90 + f6574dd commit 2fb3690
Showing 12 changed files with 83 additions and 52 deletions.
9 changes: 7 additions & 2 deletions .github/workflows/python-package.yml
@@ -3,8 +3,13 @@

name: Python package

on: push

on:
push:
branches: [ master ]
tags:
- "v*" # Tag events matching v*, i.e. v1.0, v20.15.10
pull_request:
branches: [ master ]
jobs:
build:

3 changes: 3 additions & 0 deletions CHANGES.md
@@ -1,3 +1,6 @@
### 1.0.3 (2023-11-20) ###
- [x] `pairtools dedup`: update default chunksize to 10,000 to prevent memory overflow on datasets with high duplication rate

### 1.0.2 (2022-11-XX) ###

- [x] `pairtools select` regex update
5 changes: 2 additions & 3 deletions doc/conf.py
@@ -19,7 +19,7 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))

# -- General configuration ------------------------------------------------

@@ -38,7 +38,6 @@
"sphinx.ext.napoleon",
"sphinx.ext.mathjax",
"sphinx_click.ext",
"recommonmark",
"nbsphinx",
"sphinx_rtd_theme",
]
@@ -100,7 +99,7 @@ def get_version():
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = 'en'

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
4 changes: 2 additions & 2 deletions pairtools/__init__.py
@@ -4,12 +4,12 @@
CLI tools to process mapped Hi-C data
:copyright: (c) 2017-2022 Open2C
:copyright: (c) 2017-2023 Open2C
:author: Open2C
:license: MIT
"""

__version__ = "1.0.2"
__version__ = "1.0.3"

# from . import lib
2 changes: 1 addition & 1 deletion pairtools/cli/dedup.py
@@ -113,7 +113,7 @@
@click.option(
"--chunksize",
type=int,
default=100_000,
default=10_000,
show_default=True,
help="Number of pairs in each chunk. Reduce for lower memory footprint."
" Below 10,000 performance starts suffering significantly and the algorithm might"
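For context: the `--chunksize` option shown above stays overridable per run, so the smaller 10,000-pair default does not cap what users can request. Below is a minimal sketch (hypothetical input/output paths, not part of this commit) that mirrors how the test suite further down drives the CLI through subprocess:

```python
import subprocess

# Hypothetical paths; "--chunksize" overrides the new 10,000-pair default,
# trading a larger in-memory chunk for fewer chunk boundaries.
subprocess.check_output(
    [
        "python", "-m", "pairtools", "dedup",
        "input.sorted.pairsam",         # hypothetical input
        "--output", "deduped.pairsam",  # hypothetical output
        "--chunksize", "50000",
    ]
)
```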
1 change: 0 additions & 1 deletion pairtools/lib/__init__.py
@@ -1,6 +1,5 @@
from . import fileio
from . import dedup
from . import dedup_cython
from . import filterbycov
from . import headerops
from . import pairsam_format
23 changes: 12 additions & 11 deletions pairtools/lib/dedup.py
@@ -4,6 +4,7 @@
import scipy.spatial
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from csv import QUOTE_NONE

from . import dedup_cython, pairsam_format
from .stats import PairCounter
@@ -15,7 +16,8 @@

# Ignore pandas future warnings:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

warnings.simplefilter(action="ignore", category=FutureWarning)

# Setting for cython deduplication:
# you don't need to load more than 10k lines at a time b/c you get out of the
@@ -92,7 +94,6 @@ def streaming_dedup(
-------
"""

deduped_chunks = _dedup_stream(
in_stream=in_stream,
colnames=colnames,
@@ -131,7 +132,7 @@ def streaming_dedup(
# Stream the dups:
if outstream_dups:
df_chunk.loc[mask_mapped & mask_duplicates, :].to_csv(
outstream_dups, index=False, header=False, sep="\t"
outstream_dups, index=False, header=False, sep="\t", quoting=QUOTE_NONE
)

# Drop readID if it was created (not needed for nodup and unmapped pairs):
@@ -141,12 +142,16 @@
# Stream unmapped:
if outstream_unmapped:
df_chunk.loc[~mask_mapped, :].to_csv(
outstream_unmapped, index=False, header=False, sep="\t"
outstream_unmapped,
index=False,
header=False,
sep="\t",
quoting=QUOTE_NONE,
)

# Stream unique pairs:
df_chunk.loc[mask_mapped & (~mask_duplicates), :].to_csv(
outstream, index=False, header=False, sep="\t"
outstream, index=False, header=False, sep="\t", quoting=QUOTE_NONE
)

t1 = time.time()
@@ -173,9 +178,7 @@ def _dedup_stream(
count_dups=False,
):
# Stream the input dataframe:
dfs = pd.read_table(
in_stream, comment=None, names=colnames, chunksize=chunksize
)
dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)

# Set up the carryover dataframe:
df_prev_nodups = pd.DataFrame([])
@@ -445,7 +448,7 @@ def streaming_dedup_cython(
read_idx = 0 # read index to mark the parent readID
while True:
rawline = next(instream, None)
stripline = rawline.strip('\n') if rawline else None
stripline = rawline.strip("\n") if rawline else None

# take care of empty lines not at the end of the file separately
if rawline and (not stripline):
@@ -461,7 +464,6 @@
)

if (cols[c1ind] == unmapped_chrom) or (cols[c2ind] == unmapped_chrom):

if outstream_unmapped:
outstream_unmapped.write(stripline)
# don't forget terminal newline
@@ -627,7 +629,6 @@ def mark_split_pair_as_dup(cols):

if (len(cols) > pairsam_format.COL_SAM1) and (len(cols) > pairsam_format.COL_SAM2):
for i in (pairsam_format.COL_SAM1, pairsam_format.COL_SAM2):

# split each sam column into sam entries, tag and assemble back
cols[i] = pairsam_format.INTER_SAM_SEP.join(
[
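The `quoting=QUOTE_NONE` arguments added above make pandas write each chunk verbatim. With the default `QUOTE_MINIMAL`, any field that happens to contain a `"` character (for example inside a stored SAM tag) gets wrapped in quotes with inner quotes doubled, which would corrupt the tab-separated `.pairsam` stream. A minimal sketch with toy data, not taken from the repository:

```python
import io
from csv import QUOTE_NONE

import pandas as pd

# Toy stand-in for one chunk of pairs; the sam field contains a quote character.
df = pd.DataFrame({"readID": ["r1"], "sam1": ['AS:i:0 XS:Z:"q"']})

quoted, unquoted = io.StringIO(), io.StringIO()
df.to_csv(quoted, index=False, header=False, sep="\t")
df.to_csv(unquoted, index=False, header=False, sep="\t", quoting=QUOTE_NONE)

print(repr(quoted.getvalue()))    # 'r1\t"AS:i:0 XS:Z:""q"""\n' -- field re-quoted
print(repr(unquoted.getvalue()))  # 'r1\tAS:i:0 XS:Z:"q"\n'     -- written as-is
```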
38 changes: 15 additions & 23 deletions pairtools/lib/stats.py
@@ -176,17 +176,13 @@ def __getitem__(self, key, filter="no_filter"):
# there is only genomic distance range of the bin that's left:
(bin_range,) = k_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1
)
# store corresponding value:
return self._stat[filter]["dist_freq"][dirs][bin_idx]
return self._stat[filter]["dist_freq"][dirs][dist_bin_left]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
@@ -337,20 +333,13 @@ def from_file(cls, file_handle):
# there is only genomic distance range of the bin that's left:
(bin_range,) = key_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(
stat_from_file._dist_bins, int(dist_bin_left), "right"
)
- 1
)
# store corresponding value:
stat_from_file._stat[default_filter][key][dirs][bin_idx] = int(
stat_from_file._stat[default_filter][key][dirs][dist_bin_left] = int(
fields[1]
)
else:
@@ -380,9 +369,11 @@ def from_yaml(cls, file_handle):
new PairCounter filled with the contents of the input file
"""
# fill in from file - file_handle:
stat_from_file = cls()

stat = yaml.safe_load(file_handle)
stat_from_file = cls(
filters={key: val.get("filter_expression", "") for key, val in stat.items()}
)

for key, filter in stat.items():
chromdict = {}
for chroms in stat[key]["chrom_freq"].keys():
@@ -444,10 +435,10 @@ def add_pair(
if chrom1 == chrom2:
self._stat[filter]["cis"] += 1
dist = np.abs(pos2 - pos1)
bin = self._dist_bins[
dist_bin = self._dist_bins[
np.searchsorted(self._dist_bins, dist, "right") - 1
]
self._stat[filter]["dist_freq"][strand1 + strand2][bin] += 1
self._stat[filter]["dist_freq"][strand1 + strand2][dist_bin] += 1
if dist >= 1000:
self._stat[filter]["cis_1kb+"] += 1
if dist >= 2000:
@@ -637,7 +628,6 @@ def __add__(self, other, filter="no_filter"):
].get(union_key, 0) + other._stat[filter][k].get(union_key, 0)
elif k == "dist_freq":
for dirs in sum_stat[k]:

from functools import reduce

def reducer(accumulator, element):
@@ -701,17 +691,19 @@ def flatten(self, filter="no_filter"):
if (k == "dist_freq") and v:
for i in range(len(self._dist_bins)):
for dirs, freqs in v.items():
dist = self._dist_bins[i]
# last bin is treated differently: "100000+" vs "1200-3000":
if i != len(self._dist_bins) - 1:
dist = self._dist_bins[i]
if i < len(self._dist_bins) - 1:
dist_next = self._dist_bins[i + 1]
formatted_key = self._KEY_SEP.join(
["{}", "{}-{}", "{}"]
).format(k, dist, dist_next, dirs)
else:
elif i == len(self._dist_bins) - 1:
formatted_key = self._KEY_SEP.join(
["{}", "{}+", "{}"]
).format(k, dist, dirs)
else:
raise ValueError("There is a mismatch between dist_freq bins in the instance")
# store key,value pair:
flat_stat[formatted_key] = freqs[dist]
elif (k in ["pair_types", "dedup", "chromsizes"]) and v:
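The `stats.py` hunks above drop the extra translation of a distance bin's left edge into a positional index via `np.searchsorted` and instead key the `dist_freq` counters directly by that left edge, which is also what the flattened keys such as `dist_freq/1500-6000/++` encode. A rough sketch of the lookup, using made-up bin edges rather than the library's actual `_dist_bins`:

```python
import numpy as np

# Made-up distance bin edges; PairCounter builds its own _dist_bins internally.
dist_bins = np.array([0, 1000, 2000, 4000, 10_000, 100_000, 1_000_000])

def left_edge(dist):
    # Left edge of the bin containing `dist`; after this change that edge is
    # used directly as the dist_freq key rather than converted to a bin index.
    return int(dist_bins[np.searchsorted(dist_bins, dist, "right") - 1])

dist_freq = {"++": {int(edge): 0 for edge in dist_bins}}
dist_freq["++"][left_edge(3_500)] += 1    # lands in the 2000-4000 bin
dist_freq["++"][left_edge(250_000)] += 1  # lands in the 100000+ bin

# Flattened, these become keys like "dist_freq/2000-4000/++" and "dist_freq/100000+/++".
```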
17 changes: 11 additions & 6 deletions readthedocs.yml
@@ -1,12 +1,17 @@

# .readthedocs.yml
version: 2

build:
image: latest
os: ubuntu-22.04
tools:
python: "3.10"

python:
version: 3.8
pip_install: true

requirements_file: requirements_doc.txt
sphinx:
configuration: doc/conf.py

python:
install:
- requirements: requirements_doc.txt
- method: pip
path: .
7 changes: 4 additions & 3 deletions requirements_doc.txt
@@ -6,9 +6,10 @@ pandas
pysam
bioframe
click>=7.0
git+https://github.com/golobor/sphinx-click
sphinx-click
ipython
nbsphinx
Sphinx
Sphinx>=7.0
sphinx_rtd_theme
docutils>0.16
docutils>0.16
-e .
6 changes: 6 additions & 0 deletions tests/data/mock_empty.4dedup.pairsam
@@ -0,0 +1,6 @@
## pairs format v1.0.0
#sorted: chr1-chr2-pos1-pos2
#shape: upper triangle
#genome_assembly: unknown
#chromosomes: chr1 chr2
#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type sam1 sam2
20 changes: 20 additions & 0 deletions tests/test_dedup.py
@@ -26,6 +26,9 @@

max_mismatch = 1

mock_empty_pairsam_path_dedup = os.path.join(testdir, "data", "mock_empty.4dedup.pairsam")
empty_dedup_path = os.path.join(tmpdir_name, "empty_dedup.pairsam")


@pytest.fixture
def setup_dedup():
@@ -84,6 +87,17 @@ def setup_dedup():
str(max_mismatch),
],
)
subprocess.check_output(
[
"python",
"-m",
"pairtools",
"dedup",
mock_empty_pairsam_path_dedup,
"--output",
empty_dedup_path,
],
)
except subprocess.CalledProcessError as e:
print(e.output)
print(sys.exc_info())
@@ -160,5 +174,11 @@ def pairs_overlap(pair1, pair2, max_mismatch):
for pair1 in dup_pairs
]
)
empty_dedup_pairs = [
l.strip().split("\t")
for l in open(empty_dedup_path, "r")
if not l.startswith("#") and l.strip()
]
assert len(empty_dedup_pairs) == 0

tmpdir.cleanup()
