From 16bd92599b64109126aaa40760e54e545c858479 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 8 Dec 2023 10:43:56 +0100 Subject: [PATCH] move feature name assignment to right spot --- src/glum/_glm.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 3afaa54e..95e7ad42 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -228,6 +228,20 @@ def _check_offset( return offset +def _name_categorical_variables( + categories: tuple[str], column_name: str, drop_first: bool +): + new_names = [ + f"{column_name}__{category}" for category in categories[int(drop_first) :] + ] + if len(new_names) == 0: + raise ValueError( + f"Categorical column: {column_name}, contains only one category. " + + "This should be dropped from the feature matrix." + ) + return new_names + + def _parse_formula( formula: FormulaSpec, include_intercept: bool = True ) -> tuple[Optional[Formula], Formula]: @@ -2696,16 +2710,6 @@ def _set_up_and_check_fit_args( self.term_names_ = list( chain.from_iterable( [term] * len(cols) for term, _, cols in X.model_spec.structure - - if any(X.dtypes == "category"): - self.feature_names_ = list( - chain.from_iterable( - _name_categorical_variables( - dtype.categories, column, getattr(self, "drop_first", False) - ) - if isinstance(dtype, pd.CategoricalDtype) - else [column] - for column, dtype in zip(X.columns, X.dtypes) ) ) @@ -2715,6 +2719,17 @@ def _set_up_and_check_fit_args( self.feature_dtypes_ = X.dtypes.to_dict() if any(X.dtypes == "category"): + + self.feature_names_ = list( + chain.from_iterable( + _name_categorical_variables( + dtype.categories, column, getattr(self, "drop_first", False) + ) + if isinstance(dtype, pd.CategoricalDtype) + else [column] + for column, dtype in zip(X.columns, X.dtypes) + ) + ) def _expand_categorical_penalties(penalty, X, drop_first): """