owkin · BorisMuzellec · Aug 7, 2023 · Jul 31, 2023 · Aug 1, 2023 · Aug 1, 2023
@@ -13,8 +13,9 @@ It aims to facilitate DEA experiments for python users.
 As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from 
 scratch, you may experience some differences in terms of retrieved values or available features.
 
-Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor 
-and n-level multi-factor analysis (with categorical factors), but we plan to implement more in the future.
+Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor and 
+multi-factor analysis (with categorical or continuous factors) using Wald tests.
+We plan to implement more in the future.
 In case there is a feature you would particularly like to be implemented, feel free to open an issue.
 
 ## Table of Contents
@@ -129,8 +130,9 @@ Here are some of the features and improvements we plan to implement in the futur
 - [x] Variance-stabilizing transformation
 - [ ] Improving multi-factor analysis:
   * [x] Allowing n-level factors
+  * [x] Support for continuous covariates
   * [ ] Implementing interaction terms
-  * [ ] Support for continuous covariates
+
 
 ## Citing this work
 

@@ -68,7 +68,11 @@ class DeseqDataSet(ad.AnnData):
 
     design_factors : str or list
         Name of the columns of metadata to be used as design variables.
-        Only categorial factors are supported. (default: ``'condition'``).
+        (default: ``'condition'``).
+
+    continuous_factors : list or None
+        An optional list of continuous (as opposed to categorical) factors. Any factor
+        not in ``continuous_factors`` will be considered categorical (default: ``None``).
 
     ref_level : list or None
         An optional list of two strings of the form ``["factor", "test_level"]``
@@ -170,6 +174,7 @@ def __init__(
         counts: Optional[pd.DataFrame] = None,
         metadata: Optional[pd.DataFrame] = None,
         design_factors: Union[str, List[str]] = "condition",
+        continuous_factors: Optional[List[str]] = None,
         ref_level: Optional[List[str]] = None,
         min_mu: float = 0.5,
         min_disp: float = 1e-8,
@@ -209,6 +214,8 @@ def __init__(
         self.design_factors = (
             [design_factors] if isinstance(design_factors, str) else design_factors
         )
+        self.continuous_factors = continuous_factors
+
         if self.obs[self.design_factors].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factors.")
         self.obs[self.design_factors] = self.obs[self.design_factors].astype(str)
@@ -218,6 +225,7 @@ def __init__(
         self.obsm["design_matrix"] = build_design_matrix(
             metadata=self.obs,
             design_factors=self.design_factors,
+            continuous_factors=self.continuous_factors,
             ref_level=ref_level,
             expanded=False,
             intercept=True,

@@ -40,7 +40,10 @@ class DeseqStats:
         ``['variable_of_interest', 'tested_level', 'ref_level']``.
         Names must correspond to the metadata data passed to the DeseqDataSet.
         E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' compared
-        to 'condition A'. If None, the last variable from the design matrix is chosen
+        to 'condition A'.
+        For continuous variables, the last two strings should be left empty, e.g.
+        ``['measurement', '', ''].
+        If None, the last variable from the design matrix is chosen
         as the variable of interest, and the reference level is picked alphabetically.
         (default: ``None``).
 
@@ -206,11 +209,14 @@ def summary(self) -> None:
         self.results_df["pvalue"] = self.p_values
         self.results_df["padj"] = self.padj
 
-        if not self.quiet:
+        if self.contrast[1] == self.contrast[2] == "":
+            # The factor is continuous
+            print(f"Log2 fold change & Wald test p-value: " f"{self.contrast[0]}")
+        else:
+            # The factor is categorical
             print(
                 f"Log2 fold change & Wald test p-value: "
-                f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}",
-                file=sys.stderr,
+                f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
             )
         display(self.results_df)
 
@@ -298,7 +304,14 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None:
             (default: ``None``).
         """
 
-        contrast_level = f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
+        if self.contrast[1] == self.contrast[2] == "":
+            # The factor being tested is continuous
+            contrast_level = self.contrast[0]
+        else:
+            # The factor being tested is categorical
+            contrast_level = (
+                f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}"
+            )
 
         if coeff is not None:
             if coeff not in self.LFC.columns:
@@ -400,17 +413,21 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None:
         if hasattr(self, "results_df"):
             self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2)
             self.results_df["lfcSE"] = self.SE / np.log(2)
-
-            # Get the corrresponding factor, tested and reference levels of the shrunk
+            # Get the corresponding factor, tested and reference levels of the shrunk
             # coefficient
             split_coeff = coeff.split("_")
-            # coeffs are of the form "factor_A_vs_B", hence "factor" is split_coeff[0],
-            # "A" is split_coeff[1] and "B" split_coeff[3]
-            if not self.quiet:
+            # Categorical coeffs are of the form "factor_A_vs_B", and continuous coeffs
+            # of the form "factor".
+            if len(split_coeff) == 1:
+                # The factor is continuous
+                print(f"Shrunk log2 fold change & Wald test p-value: " f"{coeff}")
+            else:
+                # The factor is categorical
+                # Categorical coeffs are of the form "factor_A_vs_B", hence "factor"
+                # is split_coeff[0], "A" is split_coeff[1] and "B" split_coeff[3]
                 print(
-                    f"Shrunk Log2 fold change & Wald test p-value: "
-                    f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}",
-                    file=sys.stderr,
+                    f"Shrunk log2 fold change & Wald test p-value: "
+                    f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}"
                 )
 
             display(self.results_df)
@@ -613,7 +630,10 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
         ``['variable_of_interest', 'tested_level', 'reference_level']``.
         Names must correspond to the metadata data passed to the DeseqDataSet.
         E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B'
-        compared to 'condition A'. If None, the last variable from the design matrix
+        compared to 'condition A'.
+        For continuous variables, the last two strings will be left empty, e.g.
+        ``['measurement', '', ''].
+        If None, the last variable from the design matrix
         is chosen as the variable of interest, and the reference level is picked
         alphabetically.
 
@@ -633,26 +653,38 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None:
                     f"The contrast variable ('{contrast[0]}') should be one "
                     f"of the design factors."
                 )
-            if contrast[1] not in self.dds.obs[contrast[0]].values:
-                raise KeyError(
-                    f"The tested level ('{contrast[1]}') should correspond to one of the"
-                    f" levels of '{contrast[0]}'"
-                )
-            if contrast[2] not in self.dds.obs[contrast[0]].values:
-                raise KeyError(
-                    f"The reference level ('{contrast[2]}') should correspond to one of"
-                    f" the levels of '{contrast[0]}'"
-                )
+            if not (contrast[1] == contrast[2] == ""):
+                # The contrast factor is categorical, so we should check that the tested
+                # and reference levels are valid.
+                if contrast[1] not in self.dds.obs[contrast[0]].values:
+                    raise KeyError(
+                        f"The tested level ('{contrast[1]}') should correspond to "
+                        f"one of the levels of '{contrast[0]}'"
+                    )
+                if contrast[2] not in self.dds.obs[contrast[0]].values:
+                    raise KeyError(
+                        f"The reference level ('{contrast[2]}') should correspond to "
+                        f"one of the levels of '{contrast[0]}'"
+                    )
             self.contrast = contrast
         else:  # Build contrast if None
             factor = self.dds.design_factors[-1]
-            factor_col = next(
-                col
-                for col in self.dds.obsm["design_matrix"].columns
-                if col.startswith(factor)
-            )
-            split_col = factor_col.split("_")
-            self.contrast = [split_col[0], split_col[1], split_col[-1]]
+            # Check whether this factor is categorical or continuous.
+            if (
+                self.dds.continuous_factors is not None
+                and factor in self.dds.continuous_factors
+            ):
+                # The factor is continuous
+                self.contrast = [factor, "", ""]
+            else:
+                # The factor is categorical
+                factor_col = next(
+                    col
+                    for col in self.dds.obsm["design_matrix"].columns
+                    if col.startswith(factor)
+                )
+                split_col = factor_col.split("_")
+                self.contrast = [split_col[0], split_col[1], split_col[-1]]
 
     def _build_contrast_vector(self) -> None:
         """
@@ -663,7 +695,11 @@ def _build_contrast_vector(self) -> None:
         factor = self.contrast[0]
         alternative = self.contrast[1]
         ref = self.contrast[2]
-        contrast_level = f"{factor}_{alternative}_vs_{ref}"
+        if ref == alternative == "":
+            # "factor" is a continuous variable
+            contrast_level = factor
+        else:
+            contrast_level = f"{factor}_{alternative}_vs_{ref}"
 
         self.contrast_vector = np.zeros(self.LFC.shape[-1])
         if contrast_level in self.design_matrix.columns:

@@ -141,14 +141,13 @@ def build_design_matrix(
     metadata: pd.DataFrame,
     design_factors: Union[str, List[str]] = "condition",
     ref_level: Optional[List[str]] = None,
+    continuous_factors: Optional[List[str]] = None,
     expanded: bool = False,
     intercept: bool = True,
 ) -> pd.DataFrame:
     """Build design_matrix matrix for DEA.
 
-    Only single factor, 2-level designs are currently supported.
     Unless specified, the reference factor is chosen alphabetically.
-    NOTE: For now, each factor should have exactly two levels.
 
     Parameters
     ----------
@@ -160,13 +159,18 @@ def build_design_matrix(
         Name of the columns of metadata to be used as design_matrix variables.
         (default: ``"condition"``).
 
-    ref_level : list or None
+    ref_level : dict or None
         An optional list of two strings of the form ``["factor", "ref_level"]``
         specifying the factor of interest and the desired reference level, e.g.
         ``["condition", "A"]``. (default: ``None``).
 
+    continuous_factors : list or None
+        An optional list of continuous (as opposed to categorical) factors. Any factor
+        not in ``continuous_factors`` will be considered categorical (default: ``None``).
+
     expanded : bool
-        If true, use one column per category. Else, use a single column.
+        If true, use one column per category. Else, use n-1 columns, for each n-level
+        categorical factor.
         (default: ``False``).
 
     intercept : bool
@@ -192,55 +196,78 @@ def build_design_matrix(
                 f"takes the single value '{np.unique(metadata[factor])}'."
             )
 
-    design_matrix = pd.get_dummies(metadata[design_factors], drop_first=not expanded)
+    if continuous_factors is not None:
+        categorical_factors = [
+            factor for factor in design_factors if factor not in continuous_factors
+        ]
+    else:
+        categorical_factors = design_factors
 
-    if ref_level is not None:
-        if len(ref_level) != 2:
-            raise KeyError("The reference level should contain 2 strings.")
-        if ref_level[1] not in metadata[ref_level[0]].values:
-            raise KeyError(
-                f"The metadata data should contain a '{ref_level[0]}' column"
-                f" with a '{ref_level[1]}' level."
-            )
+    # Check that there is at least one categorical factor
+    if len(categorical_factors) > 0:
+        design_matrix = pd.get_dummies(
+            metadata[categorical_factors], drop_first=not expanded
+        )
 
-        # Check that the reference level is not in the matrix (if unexpanded design)
-        ref_level_name = "_".join(ref_level)
-        if (not expanded) and ref_level_name in design_matrix.columns:
-            # Remove the reference level and add one
-            factor_cols = [
-                col for col in design_matrix.columns if col.startswith(ref_level[0])
-            ]
-            missing_level = next(
-                level
-                for level in np.unique(metadata[ref_level[0]])
-                if f"{ref_level[0]}_{level}" not in design_matrix.columns
-            )
-            design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
-                factor_cols
-            ].sum(1)
-            design_matrix.drop(ref_level_name, axis="columns", inplace=True)
-
-    if not expanded:
-        # Add reference level as column name suffix
-        for factor in design_factors:
-            if ref_level is None or factor != ref_level[0]:
-                # The reference is the unique level that is no longer there
-                ref = next(
+        if ref_level is not None:
+            if len(ref_level) != 2:
+                raise KeyError("The reference level should contain 2 strings.")
+            if ref_level[1] not in metadata[ref_level[0]].values:
+                raise KeyError(
+                    f"The metadata data should contain a '{ref_level[0]}' column"
+                    f" with a '{ref_level[1]}' level."
+                )
+
+            # Check that the reference level is not in the matrix (if unexpanded design)
+            ref_level_name = "_".join(ref_level)
+            if (not expanded) and ref_level_name in design_matrix.columns:
+                # Remove the reference level and add one
+                factor_cols = [
+                    col for col in design_matrix.columns if col.startswith(ref_level[0])
+                ]
+                missing_level = next(
                     level
-                    for level in np.unique(metadata[factor])
-                    if f"{factor}_{level}" not in design_matrix.columns
+                    for level in np.unique(metadata[ref_level[0]])
+                    if f"{ref_level[0]}_{level}" not in design_matrix.columns
                 )
-            else:
-                # The reference level is given as an argument
-                ref = ref_level[1]
-            design_matrix.columns = [
-                f"{col}_vs_{ref}" if col.startswith(factor) else col
-                for col in design_matrix.columns
-            ]
+                design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[
+                    factor_cols
+                ].sum(1)
+                design_matrix.drop(ref_level_name, axis="columns", inplace=True)
+
+        if not expanded:
+            # Add reference level as column name suffix
+            for factor in design_factors:
+                if ref_level is None or factor != ref_level[0]:
+                    # The reference is the unique level that is no longer there
+                    ref = next(
+                        level
+                        for level in np.unique(metadata[factor])
+                        if f"{factor}_{level}" not in design_matrix.columns
+                    )
+                else:
+                    # The reference level is given as an argument
+                    ref = ref_level[1]
+                design_matrix.columns = [
+                    f"{col}_vs_{ref}" if col.startswith(factor) else col
+                    for col in design_matrix.columns
+                ]
+    else:
+        # There is no categorical factor in the design
+        design_matrix = pd.DataFrame(index=metadata.index)
 
     if intercept:
         design_matrix.insert(0, "intercept", 1)
-    return design_matrix.astype("int")
+
+    # Convert categorical factors one-hot encodings to int
+    design_matrix = design_matrix.astype("int")
+
+    # Add continuous factors
+    if continuous_factors is not None:
+        for factor in continuous_factors:
+            # This factor should be numeric
+            design_matrix[factor] = pd.to_numeric(metadata[factor])
+    return design_matrix
 
 
 def dispersion_trend(

@@ -0,0 +1,11 @@
+"","x"
+"gene1",0.803305296277605
+"gene2",0.309446516824441
+"gene3",0.947384639879212
+"gene4",0.198295272212425
+"gene5",0.360696307377574
+"gene6",0.957229132901809
+"gene7",0.191929119705575
+"gene8",0.157886544572415
+"gene9",0.207328182997571
+"gene10",0.441693084103809
@@ -0,0 +1,11 @@
+"","baseMean","log2FoldChange","lfcSE","pvalue","padj"
+"gene1",11.4990986515296,0.835509482517917,0.157553947327622,8.25559631768318e-08,4.12779815884159e-07
+"gene2",28.9837352301858,-0.0301107442738015,0.102022807763966,0.742316082840818,0.742316082840818
+"gene3",4.74082837294053,-0.352117952696066,0.188981895003229,0.0226591916785106,0.0347888573631709
+"gene4",75.0937579876887,-0.0588919361498486,0.074420032247466,0.433553338785707,0.481725931984118
+"gene5",37.4240064046328,-0.883678945485196,0.118580519296578,1.56161160353892e-16,1.56161160353892e-15
+"gene6",5.27506660897539,-0.637346372518429,0.184631706038799,0.000226126083530985,0.000565315208827462
+"gene7",29.9402634058322,-0.189385067707883,0.0802987178804772,0.0144463857495869,0.0288927714991737
+"gene8",41.0194382235228,0.0700290485828494,0.0685197558992755,0.318344050056132,0.397930062570165
+"gene9",37.5539636382544,0.411929596927421,0.0873415168121225,2.20641474529121e-07,7.35471581763738e-07
+"gene10",14.9383896722216,0.243314017901976,0.130284023976124,0.0243522001542196,0.0347888573631709