Skip to content

Commit

Permalink
__init__.py Clustering added
Browse files Browse the repository at this point in the history
BismarkPlot.py Clustering done and confidence bands
console_metagene.py confidence bands added
index.rst new classes and methods added in documentation
  • Loading branch information
shitohana committed Oct 27, 2023
1 parent b1a671d commit ff436f6
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 42 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ This page gives an overview of all public objects, functions and methods.
bismarkplot.MetageneFiles
bismarkplot.Genome
bismarkplot.ChrLevels
bismarkplot.Clustering

bismarkplot.BismarkPlot.BismarkBase
bismarkplot.BismarkPlot.LinePlot
Expand Down
153 changes: 116 additions & 37 deletions src/bismarkplot/BismarkPlot.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gzip
import re
from functools import cache
from multiprocessing import cpu_count
from os.path import getsize

Expand All @@ -15,6 +16,7 @@
from scipy.signal import savgol_filter
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy import stats

from pandas import DataFrame as pdDataFrame
from pyreadr import write_rds
Expand Down Expand Up @@ -74,6 +76,7 @@ def from_gff(cls, file: str):
dtypes={'start': pl.Int32, 'end': pl.Int32, 'chr': pl.Utf8}
).select(['chr', 'type', 'start', 'end', 'strand'])

print(f"Genome read from {file}")
return cls(genes)

def gene_body(self, min_length: int = 4000, flank_length: int = 2000) -> pl.DataFrame:
Expand Down Expand Up @@ -288,7 +291,7 @@ def __check_empty(genes):

class BismarkBase:
"""
Base class for :class:`Bismark` and plots.
Base class for :class:`Metagene` and plots.
"""

def __init__(self, bismark_df: pl.DataFrame, **kwargs):
Expand Down Expand Up @@ -341,8 +344,8 @@ def save_rds(self, filename, compress: bool = False):
"""
Save Bismark DataFrame in Rds.
:param filename: path for file.
:param compress: whether to compress to gzip or not.
:param filename: Path for file.
:param compress: Whether to compress to gzip or not.
"""
write_rds(filename, self.bismark.to_pandas(),
compress="gzip" if compress else None)
Expand All @@ -351,8 +354,8 @@ def save_tsv(self, filename, compress=False):
"""
Save Bismark DataFrame in TSV.
:param filename: path for file.
:param compress: whether to compress to gzip or not.
:param filename: Path for file.
:param compress: Whether to compress to gzip or not.
"""
if compress:
with gzip.open(filename + ".gz", "wb") as file:
Expand All @@ -370,7 +373,17 @@ def __len__(self):


class Clustering(BismarkBase):
def __init__(self, bismark_df: pl.DataFrame, count_threshold=5, **kwargs):
"""
Class for clustering genes within sample
"""

def __init__(self, bismark_df: pl.DataFrame, count_threshold=5, dist_method="euclidean", clust_method="average", **kwargs):
"""
:param bismark_df: :class:polars.DataFrame with genes data
:param count_threshold: Minimum counts per fragment
:param dist_method: Method for evaluating distance
:param clust_method: Method for hierarchical clustering
"""
super().__init__(bismark_df, **kwargs)

grouped = (
Expand Down Expand Up @@ -412,14 +425,25 @@ def __init__(self, bismark_df: pl.DataFrame, count_threshold=5, **kwargs):

# dist matrix
print("Distances calculation")
self.dist = pdist(self.matrix, metric="euclidean")
self.dist = pdist(self.matrix, metric=dist_method)
# linkage matrix
print("Linkage calculation and minimizing distances")
self.linkage = linkage(self.dist, method="average", optimal_ordering=True)
self.linkage = linkage(self.dist, method=clust_method, optimal_ordering=True)

self.order = leaves_list(self.linkage)

def dynamicTreeCut(self, **kwargs):
self.tree = None

@cache
def dynamicTreeCut(self, **kwargs) -> dict:
"""
Method for asigning genes into modules with dynamic tree cut algorithm.
:param kwargs: all arguements for dynamicTreeCut
:return: tree dictionary
"""
print("WARNING: dynamicTreeCut can take very long time to run")
return cutreeHybrid(self.linkage, self.dist, **kwargs)

def draw(
Expand All @@ -432,8 +456,6 @@ def draw(
:param fig_axes: Tuple with (fig, axes) from :meth:`matplotlib.plt.subplots`.
:param title: Title of the plot.
:param vmin: Minimum for colormap.
:param vmax: Maximum for colormap.
:return:
"""
if fig_axes is None:
Expand Down Expand Up @@ -504,9 +526,9 @@ def from_file(
"""
Initialize ChrLevels with CX_report file
:param file: path to file
:param chr_min_length: minimum length of chromosome to be analyzed
:param window_length: length of windows in bp
:param file: Path to file
:param chr_min_length: Minimum length of chromosome to be analyzed
:param window_length: Length of windows in bp
:param cpu: How many cores to use. Uses every physical core by default
:param batch_size: Number of rows to read by one CPU core
"""
Expand Down Expand Up @@ -578,9 +600,9 @@ def save_plot_rds(self, path, compress: bool = False):

def filter(self, context: str = None, strand: str = None, chr: str = None):
"""
:param context: methylation context (CG, CHG, CHH) to filter (only one).
:param strand: strand to filter (+ or -).
:param chr: chromosome name to filter.
:param context: Methylation context (CG, CHG, CHH) to filter (only one).
:param strand: Strand to filter (+ or -).
:param chr: Chromosome name to filter.
:return: Filtered :class:`Bismark`.
"""
context_filter = self.bismark["context"] == context if context is not None else True
Expand Down Expand Up @@ -664,7 +686,7 @@ def from_file(
Constructor from Bismark coverage2cytosine output.
:param cpu: How many cores to use. Uses every physical core by default
:param file: path to bismark genomeWide report
:param file: Path to bismark genomeWide report
:param genome: polars.Dataframe with gene ranges
:param upstream_windows: Number of windows flank regions to split
:param downstream_windows: Number of windows flank regions to split
Expand Down Expand Up @@ -817,9 +839,9 @@ def process_batch(df: pl.DataFrame):

def filter(self, context: str = None, strand: str = None, chr: str = None):
"""
:param context: methylation context (CG, CHG, CHH) to filter (only one).
:param strand: strand to filter (+ or -).
:param chr: chromosome name to filter.
:param context: Methylation context (CG, CHG, CHH) to filter (only one).
:param strand: Strand to filter (+ or -).
:param chr: Chromosome name to filter.
:return: Filtered :class:`Bismark`.
"""
context_filter = self.bismark["context"] == context if context is not None else True
Expand All @@ -840,7 +862,7 @@ def resize(self, to_fragments: int = None):
"""
Modify DataFrame to fewer fragments.
:param to_fragments: number of final fragments.
:param to_fragments: Number of final fragments.
:return: Resized :class:`Bismark`.
"""
if self.upstream_windows is not None and self.gene_windows is not None and self.downstream_windows is not None:
Expand Down Expand Up @@ -879,8 +901,8 @@ def trim_flank(self, upstream=True, downstream=True):
"""
Trim fragments
:param upstream: keep upstream?
:param downstream: keep downstream?
:param upstream: Keep upstream region?
:param downstream: Keep downstream region?
:return: Trimmed :class:`Bismark`.
"""
trimmed = self.bismark.lazy()
Expand All @@ -902,18 +924,18 @@ def trim_flank(self, upstream=True, downstream=True):

return self.__class__(trimmed.collect(), **metadata)

def clustering(self, count_threshold = 5):
def clustering(self, count_threshold = 5, dist_method="euclidean", clust_method="average"):
"""
Gives an order for genes in specified method.
*WARNING* - experimental function. May be very slow!
:param dist_method: Distance method to use. See :meth:`scipy.spatial.distance.pdist`
:param clust_method: Clustering method to use. See :meth:`scipy.cluster.hierarchy.linkage`
:return: list of indexes of ordered rows.
:return: List of indexes of ordered rows.
"""

return Clustering(self.bismark, count_threshold, **self.metadata)
return Clustering(self.bismark, count_threshold, dist_method, clust_method, **self.metadata)

def line_plot(self, resolution: int = None):
"""
Expand All @@ -940,15 +962,38 @@ def __init__(self, bismark_df: pl.DataFrame, **kwargs):
"""
super().__init__(bismark_df, **kwargs)

self.plot_data = self.bismark.group_by("fragment").agg(
self.plot_data = self.bismark.group_by("fragment").agg([
pl.col("sum"), pl.col("count"),
(pl.sum("sum") / pl.sum("count")).alias("density")
)
]).sort("fragment")

if self.strand == '-':
max_fragment = self.plot_data["fragment"].max()
self.plot_data = self.plot_data.with_columns(
(max_fragment - pl.col("fragment")).alias("fragment"))

@staticmethod
def __interval(sum_density: list[int], sum_counts: list[int], alpha=.95):
"""
Evaluate confidence interval for point
:param sum_density: Sums of methylated counts in fragment
:param sum_counts: Sums of all read cytosines in fragment
:param alpha: Probability for confidence band
"""
sum_density, sum_counts = np.array(sum_density), np.array(sum_counts)
average = sum_density.sum() / sum_counts.sum()

normalized = np.divide(sum_density, sum_counts)

variance = np.average((normalized - average) ** 2, weights=sum_counts)

n = sum(sum_counts) - 1

i = stats.t.interval(alpha, df=n, loc=average, scale=np.sqrt(variance / n))

return {"lower": i[0], "upper": i[1]}

def save_plot_rds(self, path, compress: bool = False):
"""
Saves plot data in a rds DataFrame with columns:
Expand All @@ -959,14 +1004,18 @@ def save_plot_rds(self, path, compress: bool = False):
| Int | Float |
+----------+---------+
"""
write_rds(path, self.plot_data.to_pandas(),
df = self.bismark.group_by("fragment").agg(
(pl.sum("sum") / pl.sum("count")).alias("density")
)
write_rds(path, df.to_pandas(),
compress="gzip" if compress else None)

def draw(
self,
fig_axes: tuple = None,
smooth: int = 10,
label: str = None,
confidence = 0,
linewidth: float = 1.0,
linestyle: str = '-',
) -> Figure:
Expand All @@ -976,6 +1025,7 @@ def draw(
:param fig_axes: Tuple with (fig, axes) from :meth:`matplotlib.plt.subplots`
:param smooth: Window for SavGol filter. (see :meth:`scipy.signal.savgol`)
:param label: Label of the plot
:param confidence: Probability for confidence bands. 0 for disabled.
:param linewidth: See matplotlib documentation.
:param linestyle: See matplotlib documentation.
:return:
Expand All @@ -985,7 +1035,21 @@ def draw(
else:
fig, axes = fig_axes

data = self.plot_data.sort("fragment")["density"]
if 0 < confidence < 1:
df = (
self.plot_data
.with_columns(
pl.struct(["sum", "count"]).map_elements(
lambda x: self.__interval(x["sum"], x["count"], confidence)
).alias("interval")
)
.unnest("interval")
.select(["fragment", "lower", "density", "upper"])
)
else:
df = self.plot_data

data = df["density"]

polyorder = 3
window = smooth if smooth > polyorder else polyorder + 1
Expand All @@ -995,11 +1059,25 @@ def draw(

x = np.arange(len(data))
data = data * 100 # convert to percents
axes.plot(x, data, label=label,

axes.plot(x, data,
label=label if label is not None else "_",
linestyle=linestyle, linewidth=linewidth)

if 0 < confidence < 1:
upper = df["upper"].to_numpy() * 100 # convert to percents
lower = df["lower"].to_numpy() * 100 # convert to percents

upper = savgol_filter(upper, window, 3, mode="nearest") if smooth else upper
lower = savgol_filter(lower, window, 3, mode="nearest") if smooth else lower

axes.fill_between(x, lower, upper, alpha=.2)

self.__add_flank_lines(axes)

axes.legend()
if label is not None:
axes.legend()

axes.set_ylabel('Methylation density, %')
axes.set_xlabel('Position')

Expand Down Expand Up @@ -1148,10 +1226,10 @@ def __add_flank_lines(self, axes: plt.Axes):
x_ticks = []
x_labels = []
if self.upstream_windows > 0:
x_ticks.append(self.upstream_windows - 1)
x_ticks.append(self.upstream_windows - .5)
x_labels.append('TSS')
if self.downstream_windows > 0:
x_ticks.append(self.gene_windows + self.upstream_windows)
x_ticks.append(self.gene_windows + self.upstream_windows - .5)
x_labels.append('TES')

if x_ticks and x_labels:
Expand Down Expand Up @@ -1348,15 +1426,16 @@ def box_plot(self, fig_axes: tuple = None, showfliers=False):
class LinePlotFiles(BismarkFilesBase):
def draw(
self,
smooth: float = .05,
smooth: int = 10,
linewidth: float = 1.0,
linestyle: str = '-',
confidence=0
):
plt.clf()
fig, axes = plt.subplots()
for lp, label in zip(self.samples, self.labels):
assert isinstance(lp, LinePlot)
lp.draw((fig, axes), smooth, label, linewidth, linestyle)
lp.draw((fig, axes), smooth, label, confidence, linewidth, linestyle)

return fig

Expand Down
2 changes: 1 addition & 1 deletion src/bismarkplot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .BismarkPlot import Metagene, MetageneFiles, Genome, ChrLevels
from .BismarkPlot import Metagene, MetageneFiles, Genome, ChrLevels, Clustering

__version__ = 1.2
9 changes: 5 additions & 4 deletions src/bismarkplot/console_metagene.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

parser.add_argument('-S', '--smooth', help='windows for smoothing', type=float, default=10)
parser.add_argument('-L', '--labels', help='labels for plots', nargs='+')
parser.add_argument('-C', '--confidence', help='probability for confidence bands for line-plot. 0 if disabled', type=float, default=0)
parser.add_argument('-H', help='vertical resolution for heat-map', type=int, default=100)
parser.add_argument('-V', help='vertical resolution for heat-map', type=int, default=100)
parser.add_argument("--dpi", help="dpi of output plot", type=int, default=200)
Expand Down Expand Up @@ -69,13 +70,13 @@ def main():
base_name = args.out + "_" + context + strand + "_{type}." + args.format

if args.line_plot:
filtered.line_plot().draw(smooth=args.smooth).savefig(base_name.format(type = "line-plot"), dpi = args.dpi)
filtered.line_plot().draw(smooth=args.smooth, confidence=args.confidence).savefig(base_name.format(type="line-plot"), dpi = args.dpi)
if args.heat_map:
filtered.heat_map(args.hresolution, args.vresolution).draw().savefig(base_name.format(type = "heat-map"), dpi = args.dpi)
filtered.heat_map(args.hresolution, args.vresolution).draw().savefig(base_name.format(type="heat-map"), dpi=args.dpi)
if args.box_plot:
filtered.trim_flank().box_plot().savefig(base_name.format(type = "box-plot"), dpi = args.dpi)
filtered.trim_flank().box_plot().savefig(base_name.format(type="box-plot"), dpi=args.dpi)
if args.violin_plot:
filtered.trim_flank().violin_plot().savefig(base_name.format(type = "violin-plot"), dpi = args.dpi)
filtered.trim_flank().violin_plot().savefig(base_name.format(type="violin-plot"), dpi=args.dpi)

except Exception:
filename = f'error{datetime.now().strftime("%m_%d_%H:%M")}.txt'
Expand Down

0 comments on commit ff436f6

Please sign in to comment.