diff --git a/README.md b/README.md index 1d918857..25e9f865 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,9 @@ It aims to facilitate DEA experiments for python users. As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from scratch, you may experience some differences in terms of retrieved values or available features. -Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor -and n-level multi-factor analysis (with categorical factors), but we plan to implement more in the future. +Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor and +multi-factor analysis (with categorical or continuous factors) using Wald tests. +We plan to implement more in the future. In case there is a feature you would particularly like to be implemented, feel free to open an issue. ## Table of Contents @@ -129,8 +130,9 @@ Here are some of the features and improvements we plan to implement in the futur - [x] Variance-stabilizing transformation - [ ] Improving multi-factor analysis: * [x] Allowing n-level factors + * [x] Support for continuous covariates * [ ] Implementing interaction terms - * [ ] Support for continuous covariates + ## Citing this work diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 8a8de4c4..0568e895 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -68,7 +68,11 @@ class DeseqDataSet(ad.AnnData): design_factors : str or list Name of the columns of metadata to be used as design variables. - Only categorial factors are supported. (default: ``'condition'``). + (default: ``'condition'``). + + continuous_factors : list or None + An optional list of continuous (as opposed to categorical) factors. Any factor + not in ``continuous_factors`` will be considered categorical (default: ``None``). ref_level : list or None An optional list of two strings of the form ``["factor", "test_level"]`` @@ -170,6 +174,7 @@ def __init__( counts: Optional[pd.DataFrame] = None, metadata: Optional[pd.DataFrame] = None, design_factors: Union[str, List[str]] = "condition", + continuous_factors: Optional[List[str]] = None, ref_level: Optional[List[str]] = None, min_mu: float = 0.5, min_disp: float = 1e-8, @@ -209,6 +214,8 @@ def __init__( self.design_factors = ( [design_factors] if isinstance(design_factors, str) else design_factors ) + self.continuous_factors = continuous_factors + if self.obs[self.design_factors].isna().any().any(): raise ValueError("NaNs are not allowed in the design factors.") self.obs[self.design_factors] = self.obs[self.design_factors].astype(str) @@ -218,6 +225,7 @@ def __init__( self.obsm["design_matrix"] = build_design_matrix( metadata=self.obs, design_factors=self.design_factors, + continuous_factors=self.continuous_factors, ref_level=ref_level, expanded=False, intercept=True, diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py index 71464476..079ecb14 100644 --- a/pydeseq2/ds.py +++ b/pydeseq2/ds.py @@ -40,7 +40,10 @@ class DeseqStats: ``['variable_of_interest', 'tested_level', 'ref_level']``. Names must correspond to the metadata data passed to the DeseqDataSet. E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' compared - to 'condition A'. If None, the last variable from the design matrix is chosen + to 'condition A'. + For continuous variables, the last two strings should be left empty, e.g. + ``['measurement', '', '']. + If None, the last variable from the design matrix is chosen as the variable of interest, and the reference level is picked alphabetically. (default: ``None``). @@ -206,11 +209,14 @@ def summary(self) -> None: self.results_df["pvalue"] = self.p_values self.results_df["padj"] = self.padj - if not self.quiet: + if self.contrast[1] == self.contrast[2] == "": + # The factor is continuous + print(f"Log2 fold change & Wald test p-value: " f"{self.contrast[0]}") + else: + # The factor is categorical print( f"Log2 fold change & Wald test p-value: " - f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}", - file=sys.stderr, + f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}" ) display(self.results_df) @@ -298,7 +304,14 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None: (default: ``None``). """ - contrast_level = f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}" + if self.contrast[1] == self.contrast[2] == "": + # The factor being tested is continuous + contrast_level = self.contrast[0] + else: + # The factor being tested is categorical + contrast_level = ( + f"{self.contrast[0]}_{self.contrast[1]}_vs_{self.contrast[2]}" + ) if coeff is not None: if coeff not in self.LFC.columns: @@ -400,17 +413,21 @@ def lfc_shrink(self, coeff: Optional[str] = None) -> None: if hasattr(self, "results_df"): self.results_df["log2FoldChange"] = self.LFC.iloc[:, coeff_idx] / np.log(2) self.results_df["lfcSE"] = self.SE / np.log(2) - - # Get the corrresponding factor, tested and reference levels of the shrunk + # Get the corresponding factor, tested and reference levels of the shrunk # coefficient split_coeff = coeff.split("_") - # coeffs are of the form "factor_A_vs_B", hence "factor" is split_coeff[0], - # "A" is split_coeff[1] and "B" split_coeff[3] - if not self.quiet: + # Categorical coeffs are of the form "factor_A_vs_B", and continuous coeffs + # of the form "factor". + if len(split_coeff) == 1: + # The factor is continuous + print(f"Shrunk log2 fold change & Wald test p-value: " f"{coeff}") + else: + # The factor is categorical + # Categorical coeffs are of the form "factor_A_vs_B", hence "factor" + # is split_coeff[0], "A" is split_coeff[1] and "B" split_coeff[3] print( - f"Shrunk Log2 fold change & Wald test p-value: " - f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}", - file=sys.stderr, + f"Shrunk log2 fold change & Wald test p-value: " + f"{split_coeff[0]} {split_coeff[1]} vs {split_coeff[3]}" ) display(self.results_df) @@ -613,7 +630,10 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None: ``['variable_of_interest', 'tested_level', 'reference_level']``. Names must correspond to the metadata data passed to the DeseqDataSet. E.g., ``['condition', 'B', 'A']`` will measure the LFC of 'condition B' - compared to 'condition A'. If None, the last variable from the design matrix + compared to 'condition A'. + For continuous variables, the last two strings will be left empty, e.g. + ``['measurement', '', '']. + If None, the last variable from the design matrix is chosen as the variable of interest, and the reference level is picked alphabetically. @@ -633,26 +653,38 @@ def _build_contrast(self, contrast: Optional[List[str]] = None) -> None: f"The contrast variable ('{contrast[0]}') should be one " f"of the design factors." ) - if contrast[1] not in self.dds.obs[contrast[0]].values: - raise KeyError( - f"The tested level ('{contrast[1]}') should correspond to one of the" - f" levels of '{contrast[0]}'" - ) - if contrast[2] not in self.dds.obs[contrast[0]].values: - raise KeyError( - f"The reference level ('{contrast[2]}') should correspond to one of" - f" the levels of '{contrast[0]}'" - ) + if not (contrast[1] == contrast[2] == ""): + # The contrast factor is categorical, so we should check that the tested + # and reference levels are valid. + if contrast[1] not in self.dds.obs[contrast[0]].values: + raise KeyError( + f"The tested level ('{contrast[1]}') should correspond to " + f"one of the levels of '{contrast[0]}'" + ) + if contrast[2] not in self.dds.obs[contrast[0]].values: + raise KeyError( + f"The reference level ('{contrast[2]}') should correspond to " + f"one of the levels of '{contrast[0]}'" + ) self.contrast = contrast else: # Build contrast if None factor = self.dds.design_factors[-1] - factor_col = next( - col - for col in self.dds.obsm["design_matrix"].columns - if col.startswith(factor) - ) - split_col = factor_col.split("_") - self.contrast = [split_col[0], split_col[1], split_col[-1]] + # Check whether this factor is categorical or continuous. + if ( + self.dds.continuous_factors is not None + and factor in self.dds.continuous_factors + ): + # The factor is continuous + self.contrast = [factor, "", ""] + else: + # The factor is categorical + factor_col = next( + col + for col in self.dds.obsm["design_matrix"].columns + if col.startswith(factor) + ) + split_col = factor_col.split("_") + self.contrast = [split_col[0], split_col[1], split_col[-1]] def _build_contrast_vector(self) -> None: """ @@ -663,7 +695,11 @@ def _build_contrast_vector(self) -> None: factor = self.contrast[0] alternative = self.contrast[1] ref = self.contrast[2] - contrast_level = f"{factor}_{alternative}_vs_{ref}" + if ref == alternative == "": + # "factor" is a continuous variable + contrast_level = factor + else: + contrast_level = f"{factor}_{alternative}_vs_{ref}" self.contrast_vector = np.zeros(self.LFC.shape[-1]) if contrast_level in self.design_matrix.columns: diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index e1868849..a328ec01 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -141,14 +141,13 @@ def build_design_matrix( metadata: pd.DataFrame, design_factors: Union[str, List[str]] = "condition", ref_level: Optional[List[str]] = None, + continuous_factors: Optional[List[str]] = None, expanded: bool = False, intercept: bool = True, ) -> pd.DataFrame: """Build design_matrix matrix for DEA. - Only single factor, 2-level designs are currently supported. Unless specified, the reference factor is chosen alphabetically. - NOTE: For now, each factor should have exactly two levels. Parameters ---------- @@ -160,13 +159,18 @@ def build_design_matrix( Name of the columns of metadata to be used as design_matrix variables. (default: ``"condition"``). - ref_level : list or None + ref_level : dict or None An optional list of two strings of the form ``["factor", "ref_level"]`` specifying the factor of interest and the desired reference level, e.g. ``["condition", "A"]``. (default: ``None``). + continuous_factors : list or None + An optional list of continuous (as opposed to categorical) factors. Any factor + not in ``continuous_factors`` will be considered categorical (default: ``None``). + expanded : bool - If true, use one column per category. Else, use a single column. + If true, use one column per category. Else, use n-1 columns, for each n-level + categorical factor. (default: ``False``). intercept : bool @@ -192,55 +196,78 @@ def build_design_matrix( f"takes the single value '{np.unique(metadata[factor])}'." ) - design_matrix = pd.get_dummies(metadata[design_factors], drop_first=not expanded) + if continuous_factors is not None: + categorical_factors = [ + factor for factor in design_factors if factor not in continuous_factors + ] + else: + categorical_factors = design_factors - if ref_level is not None: - if len(ref_level) != 2: - raise KeyError("The reference level should contain 2 strings.") - if ref_level[1] not in metadata[ref_level[0]].values: - raise KeyError( - f"The metadata data should contain a '{ref_level[0]}' column" - f" with a '{ref_level[1]}' level." - ) + # Check that there is at least one categorical factor + if len(categorical_factors) > 0: + design_matrix = pd.get_dummies( + metadata[categorical_factors], drop_first=not expanded + ) - # Check that the reference level is not in the matrix (if unexpanded design) - ref_level_name = "_".join(ref_level) - if (not expanded) and ref_level_name in design_matrix.columns: - # Remove the reference level and add one - factor_cols = [ - col for col in design_matrix.columns if col.startswith(ref_level[0]) - ] - missing_level = next( - level - for level in np.unique(metadata[ref_level[0]]) - if f"{ref_level[0]}_{level}" not in design_matrix.columns - ) - design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[ - factor_cols - ].sum(1) - design_matrix.drop(ref_level_name, axis="columns", inplace=True) - - if not expanded: - # Add reference level as column name suffix - for factor in design_factors: - if ref_level is None or factor != ref_level[0]: - # The reference is the unique level that is no longer there - ref = next( + if ref_level is not None: + if len(ref_level) != 2: + raise KeyError("The reference level should contain 2 strings.") + if ref_level[1] not in metadata[ref_level[0]].values: + raise KeyError( + f"The metadata data should contain a '{ref_level[0]}' column" + f" with a '{ref_level[1]}' level." + ) + + # Check that the reference level is not in the matrix (if unexpanded design) + ref_level_name = "_".join(ref_level) + if (not expanded) and ref_level_name in design_matrix.columns: + # Remove the reference level and add one + factor_cols = [ + col for col in design_matrix.columns if col.startswith(ref_level[0]) + ] + missing_level = next( level - for level in np.unique(metadata[factor]) - if f"{factor}_{level}" not in design_matrix.columns + for level in np.unique(metadata[ref_level[0]]) + if f"{ref_level[0]}_{level}" not in design_matrix.columns ) - else: - # The reference level is given as an argument - ref = ref_level[1] - design_matrix.columns = [ - f"{col}_vs_{ref}" if col.startswith(factor) else col - for col in design_matrix.columns - ] + design_matrix[f"{ref_level[0]}_{missing_level}"] = 1 - design_matrix[ + factor_cols + ].sum(1) + design_matrix.drop(ref_level_name, axis="columns", inplace=True) + + if not expanded: + # Add reference level as column name suffix + for factor in design_factors: + if ref_level is None or factor != ref_level[0]: + # The reference is the unique level that is no longer there + ref = next( + level + for level in np.unique(metadata[factor]) + if f"{factor}_{level}" not in design_matrix.columns + ) + else: + # The reference level is given as an argument + ref = ref_level[1] + design_matrix.columns = [ + f"{col}_vs_{ref}" if col.startswith(factor) else col + for col in design_matrix.columns + ] + else: + # There is no categorical factor in the design + design_matrix = pd.DataFrame(index=metadata.index) if intercept: design_matrix.insert(0, "intercept", 1) - return design_matrix.astype("int") + + # Convert categorical factors one-hot encodings to int + design_matrix = design_matrix.astype("int") + + # Add continuous factors + if continuous_factors is not None: + for factor in continuous_factors: + # This factor should be numeric + design_matrix[factor] = pd.to_numeric(metadata[factor]) + return design_matrix def dispersion_trend( diff --git a/tests/data/continuous/r_test_dispersions.csv b/tests/data/continuous/r_test_dispersions.csv new file mode 100644 index 00000000..de790d36 --- /dev/null +++ b/tests/data/continuous/r_test_dispersions.csv @@ -0,0 +1,11 @@ +"","x" +"gene1",0.803305296277605 +"gene2",0.309446516824441 +"gene3",0.947384639879212 +"gene4",0.198295272212425 +"gene5",0.360696307377574 +"gene6",0.957229132901809 +"gene7",0.191929119705575 +"gene8",0.157886544572415 +"gene9",0.207328182997571 +"gene10",0.441693084103809 diff --git a/tests/data/continuous/r_test_lfc_shrink_res.csv b/tests/data/continuous/r_test_lfc_shrink_res.csv new file mode 100644 index 00000000..7ae8c786 --- /dev/null +++ b/tests/data/continuous/r_test_lfc_shrink_res.csv @@ -0,0 +1,11 @@ +"","baseMean","log2FoldChange","lfcSE","pvalue","padj" +"gene1",11.4990986515296,0.835509482517917,0.157553947327622,8.25559631768318e-08,4.12779815884159e-07 +"gene2",28.9837352301858,-0.0301107442738015,0.102022807763966,0.742316082840818,0.742316082840818 +"gene3",4.74082837294053,-0.352117952696066,0.188981895003229,0.0226591916785106,0.0347888573631709 +"gene4",75.0937579876887,-0.0588919361498486,0.074420032247466,0.433553338785707,0.481725931984118 +"gene5",37.4240064046328,-0.883678945485196,0.118580519296578,1.56161160353892e-16,1.56161160353892e-15 +"gene6",5.27506660897539,-0.637346372518429,0.184631706038799,0.000226126083530985,0.000565315208827462 +"gene7",29.9402634058322,-0.189385067707883,0.0802987178804772,0.0144463857495869,0.0288927714991737 +"gene8",41.0194382235228,0.0700290485828494,0.0685197558992755,0.318344050056132,0.397930062570165 +"gene9",37.5539636382544,0.411929596927421,0.0873415168121225,2.20641474529121e-07,7.35471581763738e-07 +"gene10",14.9383896722216,0.243314017901976,0.130284023976124,0.0243522001542196,0.0347888573631709 diff --git a/tests/data/continuous/r_test_res.csv b/tests/data/continuous/r_test_res.csv new file mode 100644 index 00000000..c28e5515 --- /dev/null +++ b/tests/data/continuous/r_test_res.csv @@ -0,0 +1,11 @@ +"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj" +"gene1",11.4990986515296,0.879184485412623,0.163982554356914,5.36145133767734,8.25559631768318e-08,4.12779815884159e-07 +"gene2",28.9837352301858,-0.0331071520979685,0.10069458720744,-0.328787802960698,0.742316082840818,0.742316082840818 +"gene3",4.74082837294053,-0.427669614530459,0.187645790467605,-2.27913247328771,0.0226591916785106,0.0347888573631709 +"gene4",75.0937579876887,-0.0621031752060697,0.079301675778813,-0.783125635065858,0.433553338785707,0.481725931984118 +"gene5",37.4240064046328,-0.90812315615379,0.110052898691485,-8.25169683807747,1.56161160353892e-16,1.56161160353892e-15 +"gene6",5.27506660897539,-0.703677055092623,0.190807770853154,-3.68788468072494,0.000226126083530985,0.000565315208827462 +"gene7",29.9402634058322,-0.199067763482842,0.0813860895667706,-2.4459679110092,0.0144463857495869,0.0288927714991737 +"gene8",41.0194382235228,0.0726617273239344,0.0728170753027441,0.997866599583081,0.318344050056132,0.397930062570165 +"gene9",37.5539636382544,0.427958995592051,0.0826008386878115,5.18104903522244,2.20641474529121e-07,7.35471581763738e-07 +"gene10",14.9383896722216,0.273941360551479,0.121669161267808,2.25152666211369,0.0243522001542196,0.0347888573631709 diff --git a/tests/data/continuous/r_test_size_factors.csv b/tests/data/continuous/r_test_size_factors.csv new file mode 100644 index 00000000..2373e853 --- /dev/null +++ b/tests/data/continuous/r_test_size_factors.csv @@ -0,0 +1,101 @@ +"","x" +"sample1",1.386582232701 +"sample2",1.16764819595874 +"sample3",1.42481177838213 +"sample4",1.00384466204196 +"sample5",1.20556595350783 +"sample6",1.08479987672276 +"sample7",1.54626937515134 +"sample8",1.44100282131829 +"sample9",0.888492719016284 +"sample10",1.31360422045358 +"sample11",0.888492719016284 +"sample12",1.35009322657729 +"sample13",0.943486398397429 +"sample14",1.9131807523059 +"sample15",0.766269128597921 +"sample16",2.1752603074163 +"sample17",0.693291116350499 +"sample18",0.874316318552671 +"sample19",1.56902726331955 +"sample20",0.911222902622349 +"sample21",1.53188399830394 +"sample22",1.16423183871099 +"sample23",0.693291116350499 +"sample24",1.12694208697471 +"sample25",0.707614798798072 +"sample26",1.00384466204196 +"sample27",1.6511011971955 +"sample28",1.38853013732929 +"sample29",0.995902309419508 +"sample30",0.72978012247421 +"sample31",0.922889447361153 +"sample32",0.890507361488832 +"sample33",1.99601207241086 +"sample34",1.08479987672276 +"sample35",1.10073413146367 +"sample36",2.04338434292779 +"sample37",1.02211026493055 +"sample38",1.45956024494842 +"sample39",0.875736146969052 +"sample40",1.12818073658005 +"sample41",1.35009322657729 +"sample42",1.25835543695467 +"sample43",1.04139760299697 +"sample44",0.760979017999547 +"sample45",1.36281368657406 +"sample46",1.77698543803257 +"sample47",0.99800603620543 +"sample48",0.858125275616511 +"sample49",1.49385346412926 +"sample50",1.21496387016313 +"sample51",0.917278442886389 +"sample52",0.583824097979368 +"sample53",0.825743189744189 +"sample54",1.33660573106302 +"sample55",0.89107048737535 +"sample56",0.712405889191065 +"sample57",0.83865457635327 +"sample58",1.19486951867707 +"sample59",0.995902309419508 +"sample60",1.80762311799865 +"sample61",0.793361103871868 +"sample62",0.949768078948441 +"sample63",0.99800603620543 +"sample64",1.37869559847354 +"sample65",1.07452617595263 +"sample66",0.602782976753913 +"sample67",0.607481935081566 +"sample68",0.534304416893299 +"sample69",0.607481935081566 +"sample70",1.70351710821758 +"sample71",1.42481177838213 +"sample72",0.728596932127226 +"sample73",1.02211026493055 +"sample74",0.824439769039268 +"sample75",0.707614798798072 +"sample76",0.707614798798072 +"sample77",1.35009322657729 +"sample78",0.655198887775992 +"sample79",0.949768078948441 +"sample80",0.777170060935708 +"sample81",0.995902309419508 +"sample82",1.27711521432987 +"sample83",0.86486253186431 +"sample84",1.43192170412083 +"sample85",0.796579679118047 +"sample86",0.98040575891452 +"sample87",0.858125275616511 +"sample88",1.56209640449545 +"sample89",1.24062620820616 +"sample90",0.948714159216473 +"sample91",0.404776073404014 +"sample92",1.10295647877883 +"sample93",0.857855039050205 +"sample94",0.760030709820151 +"sample95",1.04168111884668 +"sample96",0.707614798798072 +"sample97",1.0110434388806 +"sample98",0.694265068664647 +"sample99",0.83865457635327 +"sample100",1.12694208697471 diff --git a/tests/data/continuous/test_counts.csv b/tests/data/continuous/test_counts.csv new file mode 100644 index 00000000..34516e39 --- /dev/null +++ b/tests/data/continuous/test_counts.csv @@ -0,0 +1,11 @@ +"","sample1","sample2","sample3","sample4","sample5","sample6","sample7","sample8","sample9","sample10","sample11","sample12","sample13","sample14","sample15","sample16","sample17","sample18","sample19","sample20","sample21","sample22","sample23","sample24","sample25","sample26","sample27","sample28","sample29","sample30","sample31","sample32","sample33","sample34","sample35","sample36","sample37","sample38","sample39","sample40","sample41","sample42","sample43","sample44","sample45","sample46","sample47","sample48","sample49","sample50","sample51","sample52","sample53","sample54","sample55","sample56","sample57","sample58","sample59","sample60","sample61","sample62","sample63","sample64","sample65","sample66","sample67","sample68","sample69","sample70","sample71","sample72","sample73","sample74","sample75","sample76","sample77","sample78","sample79","sample80","sample81","sample82","sample83","sample84","sample85","sample86","sample87","sample88","sample89","sample90","sample91","sample92","sample93","sample94","sample95","sample96","sample97","sample98","sample99","sample100" +"gene1",1,4,2,5,10,4,12,12,5,14,0,1,24,5,2,4,4,2,17,3,57,0,15,2,4,18,0,42,29,83,3,1,13,0,6,9,2,3,0,13,1,24,5,2,12,39,4,3,10,7,16,17,3,5,16,29,1,16,12,4,5,19,2,15,0,58,13,18,5,20,2,31,9,11,5,24,5,2,7,4,7,22,8,7,5,15,2,8,5,5,3,1,1,2,34,18,10,7,14,7 +"gene2",5,26,18,21,11,22,38,17,11,14,11,33,10,22,16,74,13,28,20,21,19,26,17,17,14,31,4,32,20,4,30,39,46,39,25,71,14,5,11,26,20,29,24,5,13,58,23,22,6,28,38,40,17,65,45,8,9,32,15,69,63,85,23,29,8,6,14,55,14,9,20,50,11,19,43,49,47,64,22,40,49,36,24,33,17,50,31,36,27,25,42,29,29,72,35,38,38,16,29,27 +"gene3",14,0,13,4,8,10,0,0,10,6,3,8,3,0,7,1,1,4,1,3,1,7,12,4,8,1,8,1,0,1,5,8,13,5,0,1,8,4,0,8,2,4,2,16,0,1,6,5,1,2,18,3,2,3,7,3,4,4,2,0,12,11,1,6,2,0,4,6,2,1,17,3,3,3,23,8,1,3,0,3,2,9,0,2,0,1,5,13,2,6,0,4,1,6,6,2,9,4,1,1 +"gene4",231,47,88,62,233,67,120,89,71,42,122,41,203,124,186,49,91,54,103,29,284,55,70,52,154,62,146,73,69,37,57,55,196,67,125,49,101,64,58,39,259,29,193,47,183,70,172,53,145,32,46,36,51,13,91,44,58,42,84,14,49,15,17,50,139,19,76,33,64,18,88,45,96,17,58,24,107,28,72,48,32,21,119,50,105,38,53,33,77,38,25,44,63,39,109,88,128,32,94,26 +"gene5",12,19,11,3,40,9,2,18,16,22,16,35,3,126,10,4,11,78,29,72,1,47,15,42,16,80,53,0,52,6,17,68,39,64,37,38,35,14,10,42,45,18,4,54,31,28,3,89,12,14,24,34,20,52,19,31,53,142,42,62,24,76,83,40,42,12,10,25,14,9,87,113,22,99,47,27,21,79,20,70,11,20,23,30,29,15,40,49,20,84,15,68,18,70,10,62,39,51,59,37 +"gene6",0,11,8,0,1,11,4,9,7,2,0,12,1,20,2,10,11,9,0,14,1,11,0,4,2,1,10,6,0,0,5,18,2,4,1,0,1,2,0,15,11,6,6,17,0,0,0,1,5,0,6,1,3,0,5,9,2,0,18,11,0,0,0,0,0,3,1,8,1,3,12,11,1,16,2,10,2,1,5,1,7,1,8,8,3,7,2,6,20,9,5,7,2,9,4,11,4,6,6,5 +"gene7",38,32,48,17,35,86,32,17,22,36,19,37,8,55,21,33,19,15,43,14,17,33,19,46,89,27,52,18,60,20,18,80,40,16,38,56,21,40,24,31,37,48,64,28,46,53,18,17,43,44,28,16,33,51,12,20,23,28,55,55,40,32,30,45,49,32,14,13,19,51,46,14,18,23,12,19,37,42,19,12,75,35,17,26,23,38,12,45,34,26,6,40,10,13,28,16,24,17,16,11 +"gene8",47,50,66,40,46,68,59,78,56,78,55,17,36,73,52,83,22,33,109,76,81,64,14,43,27,33,63,77,38,92,73,26,36,42,42,42,39,62,24,47,57,25,33,72,52,26,31,77,57,54,35,18,34,51,34,26,32,78,38,59,29,23,16,62,41,23,21,47,19,65,41,28,39,61,27,27,9,25,26,38,38,35,33,56,15,16,29,44,19,30,14,26,24,29,35,27,16,64,32,43 +"gene9",53,45,37,107,24,32,39,113,29,51,29,64,53,27,24,143,27,32,40,67,50,38,17,93,16,87,18,60,8,33,26,23,110,18,5,72,38,48,59,30,34,44,27,7,36,58,78,19,24,38,20,100,24,42,12,55,14,39,28,59,20,31,34,45,22,66,28,15,8,103,55,10,57,21,8,38,13,13,31,9,28,45,16,64,26,32,39,60,43,46,25,36,28,25,34,21,33,46,18,55 +"gene10",3,7,12,37,7,9,21,2,18,22,12,13,11,11,3,10,18,15,16,12,10,33,24,8,5,24,6,27,23,1,14,29,18,18,1,8,10,14,8,14,9,8,9,4,7,31,21,26,20,27,0,8,32,10,13,18,25,18,7,10,8,9,6,30,13,27,47,16,6,12,17,25,18,13,9,15,7,31,10,15,3,43,29,6,11,8,8,12,13,18,10,13,8,7,17,2,8,12,13,10 diff --git a/tests/data/continuous/test_metadata.csv b/tests/data/continuous/test_metadata.csv new file mode 100644 index 00000000..b395e0c6 --- /dev/null +++ b/tests/data/continuous/test_metadata.csv @@ -0,0 +1,101 @@ +"","condition","group","measurement" +"sample1","A","X",-0.164523596253587 +"sample2","A","Y",-0.253361680136508 +"sample3","A","X",0.696963375404737 +"sample4","A","Y",0.556663198673657 +"sample5","A","X",-0.68875569454952 +"sample6","A","Y",-0.70749515696212 +"sample7","A","X",0.36458196213683 +"sample8","A","Y",0.768532924515416 +"sample9","A","X",-0.112346212150228 +"sample10","A","Y",0.881107726454215 +"sample11","A","X",0.398105880367068 +"sample12","A","Y",-0.612026393250771 +"sample13","A","X",0.341119691424425 +"sample14","A","Y",-1.12936309608079 +"sample15","A","X",1.43302370170104 +"sample16","A","Y",1.98039989850586 +"sample17","A","X",-0.367221476466509 +"sample18","A","Y",-1.04413462631653 +"sample19","A","X",0.569719627442413 +"sample20","A","Y",-0.135054603880824 +"sample21","A","X",2.40161776050478 +"sample22","A","Y",-0.0392400027331692 +"sample23","A","X",0.689739362450777 +"sample24","A","Y",0.0280021587806661 +"sample25","A","X",-0.743273208882405 +"sample26","A","Y",0.188792299514343 +"sample27","A","X",-1.80495862889104 +"sample28","A","Y",1.46555486156289 +"sample29","A","X",0.153253338211898 +"sample30","A","Y",2.17261167036215 +"sample31","A","X",0.475509528899663 +"sample32","A","Y",-0.709946430921815 +"sample33","A","X",0.610726353489055 +"sample34","A","Y",-0.934097631644252 +"sample35","A","X",-1.2536334002391 +"sample36","A","Y",0.291446235517463 +"sample37","A","X",-0.443291873218433 +"sample38","A","Y",0.00110535163162413 +"sample39","A","X",0.0743413241516641 +"sample40","A","Y",-0.589520946188072 +"sample41","A","X",-0.568668732818502 +"sample42","A","Y",-0.135178615123832 +"sample43","A","X",1.1780869965732 +"sample44","A","Y",-1.52356680042976 +"sample45","A","X",0.593946187628422 +"sample46","A","Y",0.332950371213518 +"sample47","A","X",1.06309983727636 +"sample48","A","Y",-0.304183923634301 +"sample49","A","X",0.370018809916288 +"sample50","A","Y",0.267098790772231 +"sample51","B","X",-0.54252003099165 +"sample52","B","Y",1.20786780598317 +"sample53","B","X",1.16040261569495 +"sample54","B","Y",0.700213649514998 +"sample55","B","X",1.58683345454085 +"sample56","B","Y",0.558486425565304 +"sample57","B","X",-1.27659220845804 +"sample58","B","Y",-0.573265414236886 +"sample59","B","X",-1.22461261489836 +"sample60","B","Y",-0.473400636439312 +"sample61","B","X",-0.620366677224124 +"sample62","B","Y",0.0421158731442352 +"sample63","B","X",-0.910921648552446 +"sample64","B","Y",0.158028772404075 +"sample65","B","X",-0.654584643918818 +"sample66","B","Y",1.76728726937265 +"sample67","B","X",0.716707476017206 +"sample68","B","Y",0.910174229495227 +"sample69","B","X",0.384185357826345 +"sample70","B","Y",1.68217608051942 +"sample71","B","X",-0.635736453948977 +"sample72","B","Y",-0.461644730360566 +"sample73","B","X",1.43228223854166 +"sample74","B","Y",-0.650696353310367 +"sample75","B","X",-0.207380743601965 +"sample76","B","Y",-0.392807929441984 +"sample77","B","X",-0.319992868548507 +"sample78","B","Y",-0.279113302976559 +"sample79","B","X",0.494188331267827 +"sample80","B","Y",-0.177330482269606 +"sample81","B","X",-0.505957462114257 +"sample82","B","Y",1.34303882517041 +"sample83","B","X",-0.214579408546869 +"sample84","B","Y",-0.179556530043387 +"sample85","B","X",-0.100190741213562 +"sample86","B","Y",0.712666307051405 +"sample87","B","X",-0.0735644041263263 +"sample88","B","Y",-0.0376341714670479 +"sample89","B","X",-0.681660478755657 +"sample90","B","Y",-0.324270272246319 +"sample91","B","X",0.0601604404345152 +"sample92","B","Y",-0.588894486259664 +"sample93","B","X",0.531496192632572 +"sample94","B","Y",-1.51839408178679 +"sample95","B","X",0.306557860789766 +"sample96","B","Y",-1.53644982353759 +"sample97","B","X",-0.300976126836611 +"sample98","B","Y",-0.528279904445006 +"sample99","B","X",-0.652094780680999 +"sample100","B","Y",-0.0568967778473925 diff --git a/tests/test_pydeseq2.py b/tests/test_pydeseq2.py index bd4c96c3..5681ae2d 100644 --- a/tests/test_pydeseq2.py +++ b/tests/test_pydeseq2.py @@ -293,6 +293,107 @@ def test_multifactor_lfc_shrinkage(tol=0.02): ).max() < tol +# Continuous tests +def test_continuous_deseq(tol=0.02): + """Test that the outputs of the DESeq2 function match those of the original R + package, up to a tolerance in relative error, with a continuous factor. + """ + + test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) + + counts_df = pd.read_csv( + os.path.join(test_path, "data/continuous/test_counts.csv"), index_col=0 + ).T + + metadata = pd.read_csv( + os.path.join(test_path, "data/continuous/test_metadata.csv"), index_col=0 + ) + + r_res = pd.read_csv( + os.path.join(test_path, "data/continuous/r_test_res.csv"), index_col=0 + ) + + dds = DeseqDataSet( + counts=counts_df, + metadata=metadata, + design_factors=["group", "condition", "measurement"], + continuous_factors=["measurement"], + ) + dds.deseq2() + + res = DeseqStats(dds) + res.summary() + res_df = res.results_df + + # check that the same p-values are NaN + assert (res_df.pvalue.isna() == r_res.pvalue.isna()).all() + assert (res_df.padj.isna() == r_res.padj.isna()).all() + + # Check that the same LFC, p-values and adjusted p-values are found (up to tol) + assert ( + abs(r_res.log2FoldChange - res_df.log2FoldChange) / abs(r_res.log2FoldChange) + ).max() < tol + assert (abs(r_res.pvalue - res_df.pvalue) / r_res.pvalue).max() < tol + assert (abs(r_res.padj - res_df.padj) / r_res.padj).max() < tol + + +def test_continuous_lfc_shrinkage(tol=0.02): + """Test that the outputs of the lfc_shrink function match those of the original + R package (starting from the same inputs), up to a tolerance in relative error. + """ + + test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) + + r_res = pd.read_csv( + os.path.join(test_path, "data/continuous/r_test_res.csv"), index_col=0 + ) + r_shrunk_res = pd.read_csv( + os.path.join(test_path, "data/continuous/r_test_lfc_shrink_res.csv"), + index_col=0, + ) + + counts_df = pd.read_csv( + os.path.join(test_path, "data/continuous/test_counts.csv"), index_col=0 + ).T + + metadata = pd.read_csv( + os.path.join(test_path, "data/continuous/test_metadata.csv"), index_col=0 + ) + + r_size_factors = pd.read_csv( + os.path.join(test_path, "data/continuous/r_test_size_factors.csv"), + index_col=0, + ).squeeze() + + r_dispersions = pd.read_csv( + os.path.join(test_path, "data/continuous/r_test_dispersions.csv"), index_col=0 + ).squeeze() + + dds = DeseqDataSet( + counts=counts_df, + metadata=metadata, + design_factors=["group", "condition", "measurement"], + continuous_factors=["measurement"], + ) + + dds.deseq2() + dds.obsm["size_factors"] = r_size_factors.values + dds.varm["dispersions"] = r_dispersions.values + dds.varm["LFC"].iloc[:, 1] = r_res.log2FoldChange.values * np.log(2) + + res = DeseqStats(dds) + res.summary() + res.SE = r_res.lfcSE * np.log(2) + res.lfc_shrink(coeff="measurement") + shrunk_res = res.results_df + + # Check that the same LFC found (up to tol) + assert ( + abs(r_shrunk_res.log2FoldChange - shrunk_res.log2FoldChange) + / abs(r_shrunk_res.log2FoldChange) + ).max() < tol + + def test_contrast(): """ Check that the contrasts ['condition', 'B', 'A'] and ['condition', 'A', 'B'] give