Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda.recipe/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ requirements:
- python
- formulaic >=0.6
- joblib
- narwhals >=2.0.0
- numexpr
- packaging
- pandas
Expand Down
13 changes: 13 additions & 0 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ wheel = "*"

[dependencies]
formulaic = "*"
narwhals = ">=2.0.0"
numexpr = "*"
packaging = "*"
pandas = ">=1.4"
Expand Down Expand Up @@ -134,7 +135,7 @@ cxx-compiler = "*"
cython = "!=3.0.4"
make = "*"
mako = "*"
narwhals = ">=1.4.1"
narwhals = ">=2.0.0"
pip = "*"
setuptools-scm = "*"
xsimd = "<11|>12.1"
Expand All @@ -154,6 +155,7 @@ python = "3.13.*"

[feature.oldies.dependencies]
formulaic = "0.6.*"
narwhals = "2.0.*"
pandas = "1.4.*"
python = "3.9.*"
scikit-learn = "0.24.*"
Expand Down
111 changes: 79 additions & 32 deletions src/glum/_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
import typing
import warnings
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Optional, Union
from typing import Any, Optional, Union, cast

import formulaic
import narwhals.stable.v2 as nw
import numpy as np
import packaging.version
import pandas as pd
import scipy.sparse as sps
import sklearn as skl
import tabmat as tm
from narwhals.typing import IntoDataFrame
from scipy import linalg, sparse, stats
from typing_extensions import deprecated

from ._distribution import (
BinomialDistribution,
Expand All @@ -38,7 +41,13 @@
_least_squares_solver,
_trust_constr_solver,
)
from ._typing import ArrayLike, ShapedArrayLike, VectorLike, WaldTestResult
from ._typing import (
ArrayLike,
ShapedArrayLike,
ShapedArrayLikeConverted,
VectorLike,
WaldTestResult,
)
from ._utils import (
add_missing_categories,
align_df_categories,
Expand Down Expand Up @@ -174,6 +183,29 @@ def link_instance(self) -> Link:
else:
return get_link(self.link, self.family_instance)

@property
def categorical_levels_(self) -> dict[str, list[str]]:
    """Return the levels of each categorical feature column.

    Maps column name -> list of category levels, as stored when the
    estimator was fitted (``_categorical_levels_``). For estimators
    pickled before this attribute existed, the mapping is reconstructed
    from the legacy pandas ``feature_dtypes_`` attribute instead.

    Raises
    ------
    AttributeError
        If neither attribute is present (e.g. the estimator is unfitted).
    """
    if hasattr(self, "_categorical_levels_"):
        return self._categorical_levels_
    # NOTE(review): this ``hasattr`` call goes through the deprecated
    # ``feature_dtypes_`` property below, so the legacy path presumably
    # emits a deprecation warning — confirm that is intended.
    if hasattr(self, "feature_dtypes_"):
        # Compatibility with pickled models
        return {
            col: dtype.categories.tolist()
            for col, dtype in self.feature_dtypes_.items()
            if isinstance(dtype, pd.CategoricalDtype)
        }
    raise AttributeError("No categorical levels stored.")

@property
@deprecated("Use `categorical_levels_` instead.")
def feature_dtypes_(self) -> dict[str, Any]:
    # Deprecated accessor for the pandas dtypes recorded at fit time;
    # kept only so that old code and pickled models keep working.
    return self._feature_dtypes_

@feature_dtypes_.setter
@deprecated("Use `categorical_levels_` instead.")
def feature_dtypes_(self, value: dict[str, Any]) -> None:
    # Deprecated setter; stores the dtype mapping verbatim for
    # backwards compatibility only.
    self._feature_dtypes_ = value

def _get_start_coef(
self,
X: Union[tm.MatrixBase, tm.StandardizedMatrix],
Expand Down Expand Up @@ -245,9 +277,9 @@ def _get_start_coef(

return coef

def _convert_from_pandas(
def _convert_from_df(
self,
df: pd.DataFrame,
df: IntoDataFrame,
context: Optional[Mapping[str, Any]] = None,
) -> tm.MatrixBase:
"""Convert a pandas data frame to a tabmat matrix."""
Expand All @@ -256,25 +288,27 @@ def _convert_from_pandas(

cat_missing_method_after_alignment = getattr(self, "cat_missing_method", "fail")

if hasattr(self, "feature_dtypes_"):
df = nw.from_native(df)

if hasattr(self, "categorical_levels_"):
df = align_df_categories(
df,
self.feature_dtypes_,
self.categorical_levels_,
getattr(self, "has_missing_category_", {}),
cat_missing_method_after_alignment,
)
if cat_missing_method_after_alignment == "convert":
df = add_missing_categories(
df=df,
dtypes=self.feature_dtypes_,
categorical_levels=self.categorical_levels_,
feature_names=self.feature_names_,
cat_missing_name=self.cat_missing_name,
categorical_format=self.categorical_format,
)
# there should be no missing categories after this
cat_missing_method_after_alignment = "fail"

X = tm.from_pandas(
X = tm.from_df(
df,
drop_first=self.drop_first,
categorical_format=getattr( # convention prior to v3
Expand Down Expand Up @@ -718,8 +752,8 @@ def linear_predictor(
elif alpha is not None:
alpha_index = [self._find_alpha_index(a) for a in alpha] # type: ignore

if isinstance(X, pd.DataFrame):
X = self._convert_from_pandas(X, context=capture_context(context))
if nw.dependencies.is_into_dataframe(X):
X = self._convert_from_df(X, context=capture_context(context))

X = check_array_tabmat_compliant(
X,
Expand Down Expand Up @@ -807,8 +841,9 @@ def predict(
array, shape (n_samples, n_alphas)
Predicted values times ``sample_weight``.
"""
if isinstance(X, pd.DataFrame):
X = self._convert_from_pandas(X, context=capture_context(context))
if nw.dependencies.is_into_dataframe(X):
X = self._convert_from_df(X, context=capture_context(context))
X = cast(ShapedArrayLikeConverted, X)

eta = self.linear_predictor(
X, offset=offset, alpha_index=alpha_index, alpha=alpha, context=context
Expand Down Expand Up @@ -1452,8 +1487,8 @@ def covariance_matrix(
y = self.y_model_spec_.get_model_matrix(X).toarray().ravel()
# This has to go first because X is modified in the next line

if isinstance(X, pd.DataFrame):
X = self._convert_from_pandas(X, context=capture_context(context))
if nw.dependencies.is_into_dataframe(X):
X = self._convert_from_df(X, context=capture_context(context))

X, y = check_X_y_tabmat_compliant(
X,
Expand Down Expand Up @@ -1566,8 +1601,8 @@ def covariance_matrix(
def score(
self,
X: ShapedArrayLike,
y: ShapedArrayLike,
sample_weight: Optional[ArrayLike] = None,
y: VectorLike,
sample_weight: Optional[VectorLike] = None,
offset: Optional[ArrayLike] = None,
*,
context: Optional[Union[int, Mapping[str, Any]]] = None,
Expand Down Expand Up @@ -1724,7 +1759,7 @@ def _should_copy_X(self):
def _set_up_and_check_fit_args(
self,
X: ArrayLike,
y: Optional[ArrayLike],
y: Optional[VectorLike],
sample_weight: Optional[VectorLike],
offset: Optional[VectorLike],
force_all_finite,
Expand All @@ -1747,8 +1782,8 @@ def _set_up_and_check_fit_args(
copy_X = self._should_copy_X()
drop_first = getattr(self, "drop_first", False)

if isinstance(X, pd.DataFrame):
if hasattr(self, "formula") and self.formula is not None:
if nw.dependencies.is_into_dataframe(X):
if getattr(self, "formula", None) is not None:
lhs, rhs = parse_formula(
self.formula, include_intercept=self.fit_intercept
)
Expand Down Expand Up @@ -1802,24 +1837,36 @@ def _set_up_and_check_fit_args(
else:
# Maybe TODO: expand categorical penalties with formulas

self.feature_dtypes_ = X.dtypes.to_dict()
# Backwards compatibility
if isinstance(X, pd.DataFrame):
self.feature_dtypes_ = X.dtypes.to_dict()

X = cast(nw.DataFrame, nw.from_native(X)) # avoid inferring `Never`

self._categorical_levels_ = {
col: X[col].cat.get_categories().to_list()
for col, dtype in X.schema.items()
if isinstance(dtype, (nw.Categorical, nw.Enum))
}

self.has_missing_category_ = {
col: (getattr(self, "cat_missing_method", "fail") == "convert")
and X[col].isna().any()
for col, dtype in self.feature_dtypes_.items()
if isinstance(dtype, pd.CategoricalDtype)
and X[col].is_null().any()
for col in self.categorical_levels_
}

if any(X.dtypes == "category"):
if any(
isinstance(dtype, (nw.Categorical, nw.Enum))
for dtype in X.schema.values()
):
P1 = expand_categorical_penalties(
self.P1, X, drop_first, self.has_missing_category_
)
P2 = expand_categorical_penalties(
self.P2, X, drop_first, self.has_missing_category_
)

X = tm.from_pandas(
X = tm.from_df(
X,
drop_first=drop_first,
categorical_format=getattr( # convention prior to v3
Expand All @@ -1841,7 +1888,7 @@ def _set_up_and_check_fit_args(
"The X matrix is noncontiguous and copy_X = False."
"To fix this, either set copy_X = None or pass a contiguous matrix."
)
X = X.copy()
X = X.copy() # TODO: not all dataframes can be copied like this

if (
not isinstance(X, tm.CategoricalMatrix)
Expand Down Expand Up @@ -2672,8 +2719,8 @@ def fit(

def _compute_information_criteria(
self,
X: ShapedArrayLike,
y: ShapedArrayLike,
X: ShapedArrayLikeConverted,
y: VectorLike,
sample_weight: Optional[ArrayLike] = None,
context: Optional[Mapping[str, Any]] = None,
):
Expand Down Expand Up @@ -2732,7 +2779,7 @@ def _compute_information_criteria(
def aic(
self,
X: ArrayLike,
y: ArrayLike,
y: VectorLike,
sample_weight: Optional[ArrayLike] = None,
*,
context: Optional[Union[int, Mapping[str, Any]]] = None,
Expand Down Expand Up @@ -2769,7 +2816,7 @@ def aic(
def aicc(
self,
X: ArrayLike,
y: ArrayLike,
y: VectorLike,
sample_weight: Optional[ArrayLike] = None,
*,
context: Optional[Union[int, Mapping[str, Any]]] = None,
Expand Down Expand Up @@ -2814,7 +2861,7 @@ def aicc(
def bic(
self,
X: ArrayLike,
y: ArrayLike,
y: VectorLike,
sample_weight: Optional[ArrayLike] = None,
*,
context: Optional[Union[int, Mapping[str, Any]]] = None,
Expand Down Expand Up @@ -2853,7 +2900,7 @@ def _get_info_criteria(
self,
crit: str,
X: ArrayLike,
y: ArrayLike,
y: VectorLike,
sample_weight: Optional[ArrayLike] = None,
context: Optional[Union[int, Mapping[str, Any]]] = None,
):
Expand Down
12 changes: 10 additions & 2 deletions src/glum/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,30 @@
import pandas as pd
import scipy.sparse
import tabmat as tm
from narwhals.typing import IntoDataFrame

VectorLike = Union[np.ndarray, pd.api.extensions.ExtensionArray, pd.Index, pd.Series]

ArrayLike = Union[
list,
tm.MatrixBase,
tm.StandardizedMatrix,
pd.DataFrame,
IntoDataFrame,
scipy.sparse.spmatrix,
VectorLike,
]

ShapedArrayLike = Union[
tm.MatrixBase,
tm.StandardizedMatrix,
pd.DataFrame,
IntoDataFrame,
scipy.sparse.spmatrix,
VectorLike,
]

ShapedArrayLikeConverted = Union[
tm.MatrixBase,
tm.StandardizedMatrix,
scipy.sparse.spmatrix,
VectorLike,
]
Expand Down
Loading
Loading