Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH continuous covariates #163

Merged
merged 13 commits into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ It aims to facilitate DEA experiments for python users.
As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from
scratch, you may experience some differences in terms of retrieved values or available features.

Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor
and n-level multi-factor analysis (with categorical factors), but we plan to implement more in the future.
Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor and
multi-factor analysis (with categorical or continuous factors) using Wald tests.
We plan to implement more in the future.
In case there is a feature you would particularly like to be implemented, feel free to open an issue.

## Table of Contents
Expand Down Expand Up @@ -129,8 +130,9 @@ Here are some of the features and improvements we plan to implement in the futur
- [x] Variance-stabilizing transformation
- [ ] Improving multi-factor analysis:
* [x] Allowing n-level factors
* [x] Support for continuous covariates
* [ ] Implementing interaction terms
* [ ] Support for continuous covariates


## Citing this work

Expand Down
10 changes: 9 additions & 1 deletion pydeseq2/dds.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ class DeseqDataSet(ad.AnnData):

design_factors : str or list
Name of the columns of metadata to be used as design variables.
Only categorial factors are supported. (default: ``'condition'``).
(default: ``'condition'``).

continuous_factors : list or None
An optional list of continuous (as opposed to categorical) factors. Any factor
not in ``continuous_factors`` will be considered categorical (default: ``None``).

ref_level : list or None
An optional list of two strings of the form ``["factor", "test_level"]``
Expand Down Expand Up @@ -170,6 +174,7 @@ def __init__(
counts: Optional[pd.DataFrame] = None,
metadata: Optional[pd.DataFrame] = None,
design_factors: Union[str, List[str]] = "condition",
continuous_factors: Optional[List[str]] = None,
ref_level: Optional[List[str]] = None,
min_mu: float = 0.5,
min_disp: float = 1e-8,
Expand Down Expand Up @@ -209,6 +214,8 @@ def __init__(
self.design_factors = (
[design_factors] if isinstance(design_factors, str) else design_factors
)
self.continuous_factors = continuous_factors

if self.obs[self.design_factors].isna().any().any():
raise ValueError("NaNs are not allowed in the design factors.")
self.obs[self.design_factors] = self.obs[self.design_factors].astype(str)
Expand All @@ -218,6 +225,7 @@ def __init__(
self.obsm["design_matrix"] = build_design_matrix(
metadata=self.obs,
design_factors=self.design_factors,
continuous_factors=self.continuous_factors,
ref_level=ref_level,
expanded=False,
intercept=True,
Expand Down
100 changes: 68 additions & 32 deletions pydeseq2/ds.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ class DeseqStats:
``['variable_of_interest', 'tested_level', 'ref_level']``.
Names must correspond to the metadata data passed to the DeseqDataSet.
E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' compared
to 'condition A'. If None, the last variable from the design matrix is chosen
to 'condition A'.
For continuous variables, the last two strings should be left empty, e.g.
``['measurement', '', ''].
If None, the last variable from the design matrix is chosen
as the variable of interest, and the reference level is picked alphabetically.
(default: ``None``).

Expand Down Expand Up @@ -206,11 +209,14 @@ def summary(self) -> None:
self.results_df["pvalue"] = self.p_values
self.results_df["padj"] = self.padj

if not self.quiet:
if self.contrast[1] == self.contrast[2] == "":
# The factor is continuous
print(f"Log2 fold change & Wald test p-value: " f"{self.contrast[0]}")
else:
# The factor is categorical
print(
f"Log2 fold change & Wald test p-value: "
f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}",
file=sys.stderr,
f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
)
display(self.results_df)

Expand Down Expand Up @@ -298,7 +304,14 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None:
(default: ``None``).
"""

contrast_level = f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
if self.contrast[1] == self.contrast[2] == "":
# The factor being tested is continuous
contrast_level = self.contrast[0]
else:
# The factor being tested is categorical
contrast_level = (
f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
)

if coeff is not None:
if coeff not in self.LFC.columns:
Expand Down Expand Up @@ -400,17 +413,21 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None:
if hasattr(self, "results_df"):
self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2)
self.results_df["lfcSE"] = self.SE / np.log(2)

# Get the corrresponding factor, tested and reference levels of the shrunk
# Get the corresponding factor, tested and reference levels of the shrunk
# coefficient
split_coeff = coeff.split("_")
# coeffs are of the form "factor_A_vs_B", hence "factor" is split_coeff[0],
# "A" is split_coeff[1] and "B" split_coeff[3]
if not self.quiet:
# Categorical coeffs are of the form "factor_A_vs_B", and continuous coeffs
# of the form "factor".
if len(split_coeff) == 1:
# The factor is continuous
print(f"Shrunk log2 fold change & Wald test p-value: " f"{coeff}")
else:
# The factor is categorical
# Categorical coeffs are of the form "factor_A_vs_B", hence "factor"
# is split_coeff[0], "A" is split_coeff[1] and "B" split_coeff[3]
print(
f"Shrunk Log2 fold change & Wald test p-value: "
f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}",
file=sys.stderr,
f"Shrunk log2 fold change & Wald test p-value: "
f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}"
)

display(self.results_df)
Expand Down Expand Up @@ -613,7 +630,10 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
``['variable_of_interest', 'tested_level', 'reference_level']``.
Names must correspond to the metadata data passed to the DeseqDataSet.
E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
compared to 'condition A'. If None, the last variable from the design matrix
compared to 'condition A'.
For continuous variables, the last two strings will be left empty, e.g.
``['measurement', '', ''].
If None, the last variable from the design matrix
is chosen as the variable of interest, and the reference level is picked
alphabetically.

Expand All @@ -633,26 +653,38 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
f"The contrast variable ('{contrast[0]}') should be one "
f"of the design factors."
)
if contrast[1] not in self.dds.obs[contrast[0]].values:
raise KeyError(
f"The tested level ('{contrast[1]}') should correspond to one of the"
f" levels of '{contrast[0]}'"
)
if contrast[2] not in self.dds.obs[contrast[0]].values:
raise KeyError(
f"The reference level ('{contrast[2]}') should correspond to one of"
f" the levels of '{contrast[0]}'"
)
if not (contrast[1] == contrast[2] == ""):
# The contrast factor is categorical, so we should check that the tested
# and reference levels are valid.
if contrast[1] not in self.dds.obs[contrast[0]].values:
raise KeyError(
f"The tested level ('{contrast[1]}') should correspond to "
f"one of the levels of '{contrast[0]}'"
)
if contrast[2] not in self.dds.obs[contrast[0]].values:
raise KeyError(
f"The reference level ('{contrast[2]}') should correspond to "
f"one of the levels of '{contrast[0]}'"
)
self.contrast = contrast
else: # Build contrast if None
factor = self.dds.design_factors[-1]
factor_col = next(
col
for col in self.dds.obsm["design_matrix"].columns
if col.startswith(factor)
)
split_col = factor_col.split("_")
self.contrast = [split_col[0], split_col[1], split_col[-1]]
# Check whether this factor is categorical or continuous.
if (
self.dds.continuous_factors is not None
and factor in self.dds.continuous_factors
):
# The factor is continuous
self.contrast = [factor, "", ""]
else:
# The factor is categorical
factor_col = next(
col
for col in self.dds.obsm["design_matrix"].columns
if col.startswith(factor)
)
split_col = factor_col.split("_")
self.contrast = [split_col[0], split_col[1], split_col[-1]]

def _build_contrast_vector(self) -> None:
"""
Expand All @@ -663,7 +695,11 @@ def _build_contrast_vector(self) -> None:
factor = self.contrast[0]
alternative = self.contrast[1]
ref = self.contrast[2]
contrast_level = f"{factor}_{alternative}_vs_{ref}"
if ref == alternative == "":
# "factor" is a continuous variable
contrast_level = factor
else:
contrast_level = f"{factor}_{alternative}_vs_{ref}"

self.contrast_vector = np.zeros(self.LFC.shape[-1])
if contrast_level in self.design_matrix.columns:
Expand Down
119 changes: 73 additions & 46 deletions pydeseq2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,13 @@ def build_design_matrix(
metadata: pd.DataFrame,
design_factors: Union[str, List[str]] = "condition",
ref_level: Optional[List[str]] = None,
continuous_factors: Optional[List[str]] = None,
expanded: bool = False,
intercept: bool = True,
) -> pd.DataFrame:
"""Build design_matrix matrix for DEA.

Only single factor, 2-level designs are currently supported.
Unless specified, the reference factor is chosen alphabetically.
NOTE: For now, each factor should have exactly two levels.

Parameters
----------
Expand All @@ -160,13 +159,18 @@ def build_design_matrix(
Name of the columns of metadata to be used as design_matrix variables.
(default: ``"condition"``).

ref_level : list or None
ref_level : dict or None
An optional list of two strings of the form ``["factor", "ref_level"]``
specifying the factor of interest and the desired reference level, e.g.
``["condition", "A"]``. (default: ``None``).

continuous_factors : list or None
An optional list of continuous (as opposed to categorical) factors. Any factor
not in ``continuous_factors`` will be considered categorical (default: ``None``).

expanded : bool
If true, use one column per category. Else, use a single column.
If true, use one column per category. Else, use n-1 columns, for each n-level
categorical factor.
(default: ``False``).

intercept : bool
Expand All @@ -192,55 +196,78 @@ def build_design_matrix(
f"takes the single value '{np.unique(metadata[factor])}'."
)

design_matrix = pd.get_dummies(metadata[design_factors], drop_first=not expanded)
if continuous_factors is not None:
categorical_factors = [
factor for factor in design_factors if factor not in continuous_factors
]
else:
categorical_factors = design_factors

if ref_level is not None:
if len(ref_level) != 2:
raise KeyError("The reference level should contain 2 strings.")
if ref_level[1] not in metadata[ref_level[0]].values:
raise KeyError(
f"The metadata data should contain a '{ref_level[0]}' column"
f" with a '{ref_level[1]}' level."
)
# Check that there is at least one categorical factor
if len(categorical_factors) > 0:
design_matrix = pd.get_dummies(
metadata[categorical_factors], drop_first=not expanded
)

# Check that the reference level is not in the matrix (if unexpanded design)
ref_level_name = "_".join(ref_level)
if (not expanded) and ref_level_name in design_matrix.columns:
# Remove the reference level and add one
factor_cols = [
col for col in design_matrix.columns if col.startswith(ref_level[0])
]
missing_level = next(
level
for level in np.unique(metadata[ref_level[0]])
if f"{ref_level[0]}_{level}" not in design_matrix.columns
)
design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
factor_cols
].sum(1)
design_matrix.drop(ref_level_name, axis="columns", inplace=True)

if not expanded:
# Add reference level as column name suffix
for factor in design_factors:
if ref_level is None or factor != ref_level[0]:
# The reference is the unique level that is no longer there
ref = next(
if ref_level is not None:
if len(ref_level) != 2:
raise KeyError("The reference level should contain 2 strings.")
if ref_level[1] not in metadata[ref_level[0]].values:
raise KeyError(
f"The metadata data should contain a '{ref_level[0]}' column"
f" with a '{ref_level[1]}' level."
)

# Check that the reference level is not in the matrix (if unexpanded design)
ref_level_name = "_".join(ref_level)
if (not expanded) and ref_level_name in design_matrix.columns:
# Remove the reference level and add one
factor_cols = [
col for col in design_matrix.columns if col.startswith(ref_level[0])
]
missing_level = next(
level
for level in np.unique(metadata[factor])
if f"{factor}_{level}" not in design_matrix.columns
for level in np.unique(metadata[ref_level[0]])
if f"{ref_level[0]}_{level}" not in design_matrix.columns
)
else:
# The reference level is given as an argument
ref = ref_level[1]
design_matrix.columns = [
f"{col}_vs_{ref}" if col.startswith(factor) else col
for col in design_matrix.columns
]
design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
factor_cols
].sum(1)
design_matrix.drop(ref_level_name, axis="columns", inplace=True)

if not expanded:
# Add reference level as column name suffix
for factor in design_factors:
if ref_level is None or factor != ref_level[0]:
# The reference is the unique level that is no longer there
ref = next(
level
for level in np.unique(metadata[factor])
if f"{factor}_{level}" not in design_matrix.columns
)
else:
# The reference level is given as an argument
ref = ref_level[1]
design_matrix.columns = [
f"{col}_vs_{ref}" if col.startswith(factor) else col
for col in design_matrix.columns
]
else:
# There is no categorical factor in the design
design_matrix = pd.DataFrame(index=metadata.index)

if intercept:
design_matrix.insert(0, "intercept", 1)
return design_matrix.astype("int")

# Convert categorical factors one-hot encodings to int
design_matrix = design_matrix.astype("int")

# Add continuous factors
if continuous_factors is not None:
for factor in continuous_factors:
# This factor should be numeric
design_matrix[factor] = pd.to_numeric(metadata[factor])
return design_matrix


def dispersion_trend(
Expand Down
11 changes: 11 additions & 0 deletions tests/data/continuous/r_test_dispersions.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"","x"
"gene1",0.803305296277605
"gene2",0.309446516824441
"gene3",0.947384639879212
"gene4",0.198295272212425
"gene5",0.360696307377574
"gene6",0.957229132901809
"gene7",0.191929119705575
"gene8",0.157886544572415
"gene9",0.207328182997571
"gene10",0.441693084103809
11 changes: 11 additions & 0 deletions tests/data/continuous/r_test_lfc_shrink_res.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"","baseMean","log2FoldChange","lfcSE","pvalue","padj"
"gene1",11.4990986515296,0.835509482517917,0.157553947327622,8.25559631768318e-08,4.12779815884159e-07
"gene2",28.9837352301858,-0.0301107442738015,0.102022807763966,0.742316082840818,0.742316082840818
"gene3",4.74082837294053,-0.352117952696066,0.188981895003229,0.0226591916785106,0.0347888573631709
"gene4",75.0937579876887,-0.0588919361498486,0.074420032247466,0.433553338785707,0.481725931984118
"gene5",37.4240064046328,-0.883678945485196,0.118580519296578,1.56161160353892e-16,1.56161160353892e-15
"gene6",5.27506660897539,-0.637346372518429,0.184631706038799,0.000226126083530985,0.000565315208827462
"gene7",29.9402634058322,-0.189385067707883,0.0802987178804772,0.0144463857495869,0.0288927714991737
"gene8",41.0194382235228,0.0700290485828494,0.0685197558992755,0.318344050056132,0.397930062570165
"gene9",37.5539636382544,0.411929596927421,0.0873415168121225,2.20641474529121e-07,7.35471581763738e-07
"gene10",14.9383896722216,0.243314017901976,0.130284023976124,0.0243522001542196,0.0347888573631709
Loading