Skip to content

Commit

Permalink
fix: replace underscores in factor names or factor levels with hyphens
Browse files Browse the repository at this point in the history
  • Loading branch information
BorisMuzellec committed Aug 4, 2023
1 parent de2b420 commit d75222d
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 6 deletions.
27 changes: 27 additions & 0 deletions pydeseq2/dds.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,33 @@ def __init__(
raise ValueError("NaNs are not allowed in the design factors.")
self.obs[self.design_factors] = self.obs[self.design_factors].astype(str)

# Check that design factors don't contain underscores. If so, convert them to
# hyphens.
if np.any(["_" in factor for factor in self.design_factors]):
warnings.warn(
"""Same factor names in the design contain underscores ('_'). They will
be converted to hyphens ('-').""",
UserWarning,
stacklevel=2,
)

new_factors = [factor.replace("_", "-") for factor in self.design_factors]

self.obs.rename(
columns={
old_factor: new_factor
for (old_factor, new_factor) in zip(self.design_factors, new_factors)
},
inplace=True,
)

self.design_factors = new_factors

# If ref_level has underscores, convert them to hyphens
# Don't raise a warning: it will be raised by build_design_matrix()
if ref_level is not None:
ref_level = [name.replace("_", "-") for name in ref_level]

# Build the design matrix
# Stored in the obsm attribute of the dataset
self.obsm["design_matrix"] = build_design_matrix(
Expand Down
16 changes: 16 additions & 0 deletions pydeseq2/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import multiprocessing
import warnings
from math import floor
from pathlib import Path
from typing import List
Expand Down Expand Up @@ -192,6 +193,21 @@ def build_design_matrix(
f"takes the single value '{np.unique(metadata[factor])}'."
)

# Check that factor levels in the design don't contain underscores. If so, convert
# them to hyphens.
warning_issued = False
for factor in design_factors:
if np.any(["_" in value for value in metadata[factor]]):
if not warning_issued:
warnings.warn(
"""Some factor levels in the design contain underscores ('_').
They will be converted to hyphens ('-').""",
UserWarning,
stacklevel=2,
)
warning_issued = True
metadata[factor] = metadata[factor].apply(lambda x: x.replace("_", "-"))

design_matrix = pd.get_dummies(metadata[design_factors], drop_first=not expanded)

if ref_level is not None:
Expand Down
14 changes: 8 additions & 6 deletions tests/test_edge_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,14 @@ def test_underscores_in_factors():
inplace=True,
)

# Run the pipeline. This should not cause bugs.
dds = DeseqDataSet(
counts=counts_df,
metadata=metadata,
design_factors="some_variable_with_underscores",
)
# Run the pipeline. This should raise a warning, but not cause bugs.
with pytest.warns(UserWarning):
dds = DeseqDataSet(
counts=counts_df,
metadata=metadata,
design_factors="some_variable_with_underscores",
ref_level=["some_variable_with_underscores", "level_with_underscores"],
)
dds.deseq2()

stat_res = DeseqStats(dds)
Expand Down

0 comments on commit d75222d

Please sign in to comment.