Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use narwhals to support Polars, cuDF, Modin, etc. #388

Merged
merged 40 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
75b6505
Add dependencies to pixi.toml
stanmart Sep 2, 2024
618b583
Pixi-ize pre-commit
stanmart Sep 2, 2024
d82ca13
Add pixi tasks
stanmart Sep 2, 2024
7c4df48
Update CI
stanmart Sep 2, 2024
15dfb30
Fix build dependencies
stanmart Sep 2, 2024
35a3c2d
update lockfile
stanmart Sep 2, 2024
415ff89
Fix doctest
stanmart Sep 2, 2024
2e5ceac
Try to fix readthedocs
stanmart Sep 2, 2024
6e6c4c5
Use latest pixi on conda-forge
stanmart Sep 2, 2024
ff09902
Find some minimum versions
stanmart Sep 2, 2024
5529d20
Bump minimum formulaic version
stanmart Sep 3, 2024
1e0d892
Find minimum numpy version
stanmart Sep 3, 2024
f94f98b
Make polars a test dependency
stanmart Sep 3, 2024
4fda475
Update lockfile
stanmart Sep 3, 2024
2007569
Fix typing issues
stanmart Sep 3, 2024
bee91e1
Fix benchmarks
stanmart Sep 3, 2024
0e7fb36
Update contributing docs
stanmart Sep 3, 2024
2a23b81
Make ruff happy
stanmart Sep 3, 2024
4f7a47c
Remove unnecessary pre-commit option from CI
stanmart Sep 10, 2024
f89a57a
first try
MarcAntoineSchmidtQC Sep 11, 2024
e33db3d
Added deprecation, docstring
MarcAntoineSchmidtQC Sep 12, 2024
56b7dbe
replace from_pandas and from_polars
MarcAntoineSchmidtQC Sep 12, 2024
1f8dd90
keep sorting
MarcAntoineSchmidtQC Sep 12, 2024
e02b241
Merge remote-tracking branch 'origin/main' into narwhals
MarcAntoineSchmidtQC Sep 12, 2024
6aed762
add narwhals to conda recipe
MarcAntoineSchmidtQC Sep 12, 2024
a9ea4eb
bump minimum narwhals version
MarcAntoineSchmidtQC Sep 12, 2024
bf1d303
added narwhals to setup.py
MarcAntoineSchmidtQC Sep 12, 2024
0c9df08
Changelog
MarcAntoineSchmidtQC Sep 12, 2024
86a6ebe
Fix categoricals with non-numpy-or-pandas input
stanmart Sep 13, 2024
c6c5f6b
Fix categoricals from numpy/list input
stanmart Sep 13, 2024
42ef8ac
Remove unnecessary import
stanmart Sep 13, 2024
34c4789
Merge branch 'main' into narwhals
stanmart Sep 13, 2024
1d58498
Merge fix from #387
stanmart Sep 13, 2024
64d10cd
Bump minimum narwhals version
stanmart Sep 13, 2024
beb2ee3
Merge branch 'main' into narwhals
stanmart Sep 23, 2024
d0528b7
Update tests
stanmart Sep 23, 2024
621391a
Remove unnecessary argument
stanmart Sep 23, 2024
b41bf2b
Simplify `_extract_codes_and_categories`
stanmart Sep 23, 2024
267c321
Make the check work with the new changes
stanmart Sep 23, 2024
c53d490
Import narwhals' stable v1 API
stanmart Sep 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda.recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ requirements:
- {{ pin_compatible('numpy') }}
- formulaic>=0.6
- scipy
- narwhals

test:
requires:
Expand Down
100 changes: 78 additions & 22 deletions pixi.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ formulaic = ">=0.6.4"
numpy = ">=1.24.0"
pandas = ">=1.4.4"
scipy = ">=1.7.3"
narwhals = ">=1.0.0"

[feature.dev.dependencies]
ipython = "*"
Expand Down Expand Up @@ -154,6 +155,7 @@ numpy = "1.24.0"
pandas = "1.4.4"
scipy = "1.7.3"
formulaic = "0.6.4"
narwhals = "1.0.0"

[environments]
default = ["dev", "test"]
Expand Down
4 changes: 2 additions & 2 deletions src/tabmat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import importlib.metadata

from .categorical_matrix import CategoricalMatrix
from .constructor import from_csc, from_formula, from_pandas, from_polars
from .constructor import from_csc, from_df, from_formula, from_pandas
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
Expand All @@ -23,7 +23,7 @@
"from_csc",
"from_formula",
"from_pandas",
"from_polars",
"from_df",
"as_tabmat",
"hstack",
]
204 changes: 62 additions & 142 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections.abc import Mapping
from typing import Any, Optional, Union

import narwhals as nw
import numpy as np
from formulaic import Formula, ModelSpec
from formulaic.materializers.types import NAAction
Expand All @@ -28,67 +29,9 @@
pd = None # type: ignore


def _is_boolean(series, engine: str):
    """Tell whether ``series`` holds boolean data for the given engine.

    Parameters
    ----------
    series
        Column to inspect; a pandas or polars series depending on ``engine``.
    engine : str
        Either ``"pandas"`` or ``"polars"``.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    # Guard-clause form: each supported engine returns immediately.
    if engine == "pandas":
        return pd.api.types.is_bool_dtype(series)
    if engine == "polars":
        return series.dtype.is_(pl.Boolean)
    raise ValueError(f"Unknown engine: {engine}")


def _is_numeric(series, engine: str):
    """Tell whether ``series`` holds numeric data for the given engine.

    Parameters
    ----------
    series
        Column to inspect; a pandas or polars series depending on ``engine``.
    engine : str
        Either ``"pandas"`` or ``"polars"``.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    if engine == "pandas":
        return pd.api.types.is_numeric_dtype(series)
    if engine == "polars":
        return series.dtype.is_numeric()
    raise ValueError(f"Unknown engine: {engine}")


def _iter_columns(df, engine: str):
    """Iterate over ``(column name, column data)`` pairs of ``df``.

    Parameters
    ----------
    df
        A pandas or polars DataFrame, matching ``engine``.
    engine : str
        Either ``"pandas"`` or ``"polars"``.

    Returns
    -------
    An iterable of ``(name, column)`` pairs: ``df.items()`` for pandas, and
    a generator built from ``df.iter_columns()`` for polars.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    if engine == "pandas":
        return df.items()
    if engine == "polars":
        # polars yields bare Series, so pair each one with its own name.
        return ((column.name, column) for column in df.iter_columns())
    raise ValueError(f"Unknown engine: {engine}")


def _object_as_cat(series, engine: str):
    """Convert string-like columns to categoricals, leaving others untouched.

    For pandas, a series whose dtype equals ``object`` is cast to
    ``"category"``. For polars, a series whose dtype equals ``pl.String`` is
    cast to ``pl.Categorical``. Any other series is returned as-is.

    Parameters
    ----------
    series
        Column to convert; a pandas or polars series depending on ``engine``.
    engine : str
        Either ``"pandas"`` or ``"polars"``.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    if engine == "pandas":
        return series.astype("category") if series.dtype == object else series
    if engine == "polars":
        return series.cast(pl.Categorical) if series.dtype == pl.String else series
    raise ValueError(f"Unknown engine: {engine}")


def _is_categorical(series, engine: str):
    """Tell whether ``series`` has a categorical dtype for the given engine.

    pandas columns are checked against ``pd.CategoricalDtype``; polars
    columns against ``pl.Categorical`` and ``pl.Enum``.

    Parameters
    ----------
    series
        Column to inspect; a pandas or polars series depending on ``engine``.
    engine : str
        Either ``"pandas"`` or ``"polars"``.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    if engine == "pandas":
        return isinstance(series.dtype, pd.CategoricalDtype)
    if engine == "polars":
        return isinstance(series.dtype, (pl.Categorical, pl.Enum))
    raise ValueError(f"Unknown engine: {engine}")


def _select_cols(df, idx, engine):
    """Select columns of ``df`` by integer position.

    Parameters
    ----------
    df
        A pandas or polars DataFrame, matching ``engine``.
    idx
        Positional column index/indices, forwarded to ``df.iloc[:, idx]``
        (pandas) or ``pl.nth(idx)`` (polars).
    engine
        Either ``"pandas"`` or ``"polars"``.

    Raises
    ------
    ValueError
        If ``engine`` is neither ``"pandas"`` nor ``"polars"``.
    """
    if engine == "pandas":
        return df.iloc[:, idx]
    if engine == "polars":
        return df.select(pl.nth(idx))
    raise ValueError(f"Unknown engine: {engine}")


def _from_dataframe(
@nw.narwhalify(eager_only=True)
def from_df(
df,
engine: str,
dtype: np.dtype = np.float64,
sparse_threshold: float = 0.1,
cat_threshold: int = 4,
Expand All @@ -100,11 +43,45 @@ def _from_dataframe(
cat_missing_name: str = "(MISSING)",
) -> MatrixBase:
"""
See docstring of from_pandas or from_polars for details.
Transform a DataFrame into an efficient SplitMatrix.

engine should be either 'pandas' or 'polars'.
"""
Parameters
----------
df : DataFrame
This can be any dataframe supported by narwhals (pandas, polars, etc.).
dtype : np.dtype, default np.float64
dtype of all sub-matrices of the resulting SplitMatrix.
sparse_threshold : float, default 0.1
Density threshold below which numerical columns will be stored in a sparse
format.
cat_threshold : int, default 4
Number of levels of a categorical column under which the column will be stored
as sparse one-hot-encoded columns instead of CategoricalMatrix
object_as_cat : bool, default False
If True, DataFrame columns stored as python objects will be treated as
categorical columns.
cat_position : str {'end'|'expand'}, default 'expand'
Position of the categorical variable in the index. If "end", all the
categoricals (including the ones that did not satisfy cat_threshold)
will be placed at the end of the index list. If "expand", all the variables
will remain in the same order.
drop_first : bool, default False
If True, categorical variables will have their first category dropped.
This allows multiple categorical variables to be included in an
unregularized model. If False, all categories are included.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
How to handle missing values in categorical columns:
- if 'fail', raise an error if there are missing values.
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the '(MISSING)' category.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``.

Returns
-------
SplitMatrix
"""
matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
indices: list[np.ndarray] = []
is_cat: list[bool] = []
Expand All @@ -117,12 +94,23 @@ def _from_dataframe(

mxcolidx = 0

for dfcolidx, (colname, coldata) in enumerate(_iter_columns(df, engine)):
for dfcolidx, colname in enumerate(df.columns):
coldata = df[:, dfcolidx]
if object_as_cat:
coldata = _object_as_cat(coldata, engine)
if _is_categorical(coldata, engine):
if isinstance(coldata.dtype, (nw.String, nw.Object)):
coldata = coldata.cast(nw.Categorical)

# deal with Pandas sparse dtype (not supported by narwhals)
if pd is not None:
if isinstance(nw.to_native(coldata).dtype, pd.SparseDtype):
sparse_dfidx.append(dfcolidx)
sparse_tmidx.append(mxcolidx)
mxcolidx += 1
continue

if isinstance(coldata.dtype, (nw.Categorical, nw.Enum)):
cat = CategoricalMatrix(
coldata,
nw.to_native(coldata),
drop_first=drop_first,
dtype=dtype,
column_name=colname,
Expand Down Expand Up @@ -163,7 +151,7 @@ def _from_dataframe(
mxcolidx += cat.shape[1]
elif cat_position == "end":
indices.append(np.arange(cat.shape[1]))
elif _is_boolean(coldata, engine):
elif isinstance(coldata.dtype, nw.Boolean):
if (coldata != False).mean() <= sparse_threshold: # noqa E712
sparse_dfidx.append(dfcolidx)
sparse_tmidx.append(mxcolidx)
Expand All @@ -172,7 +160,7 @@ def _from_dataframe(
dense_dfidx.append(dfcolidx)
dense_tmidx.append(mxcolidx)
mxcolidx += 1
elif _is_numeric(coldata, engine):
elif isinstance(coldata.dtype, nw.dtypes.NumericType):
if (coldata != 0).mean() <= sparse_threshold:
sparse_dfidx.append(dfcolidx)
sparse_tmidx.append(mxcolidx)
Expand All @@ -181,7 +169,6 @@ def _from_dataframe(
dense_dfidx.append(dfcolidx)
dense_tmidx.append(mxcolidx)
mxcolidx += 1

else:
ignored_cols.append(colname)

Expand All @@ -192,7 +179,7 @@ def _from_dataframe(
if dense_dfidx:
matrices.append(
DenseMatrix(
_select_cols(df, dense_dfidx, engine).to_numpy().astype(dtype),
df[:, dense_dfidx].to_numpy().astype(dtype),
column_names=np.asarray(df.columns)[dense_dfidx],
term_names=np.asarray(df.columns)[dense_dfidx],
)
Expand All @@ -202,7 +189,7 @@ def _from_dataframe(
if sparse_dfidx:
matrices.append(
SparseMatrix(
sps.coo_matrix(_select_cols(df, sparse_dfidx, engine), dtype=dtype),
sps.coo_matrix(df[:, sparse_dfidx], dtype=dtype),
dtype=dtype,
column_names=np.asarray(df.columns)[sparse_dfidx],
term_names=np.asarray(df.columns)[sparse_dfidx],
Expand Down Expand Up @@ -235,6 +222,8 @@ def from_pandas(
cat_missing_name: str = "(MISSING)",
) -> MatrixBase:
"""
Deprecated. Please use `from_df` instead.

Transform a pandas.DataFrame into an efficient SplitMatrix.

Parameters
Expand Down Expand Up @@ -274,77 +263,8 @@ def from_pandas(
-------
SplitMatrix
"""
return _from_dataframe(
df,
engine="pandas",
dtype=dtype,
sparse_threshold=sparse_threshold,
cat_threshold=cat_threshold,
object_as_cat=object_as_cat,
cat_position=cat_position,
drop_first=drop_first,
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)


def from_polars(
df,
dtype: np.dtype = np.float64,
sparse_threshold: float = 0.1,
cat_threshold: int = 4,
object_as_cat: bool = False,
cat_position: str = "expand",
drop_first: bool = False,
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
) -> MatrixBase:
"""
Transform a polars.DataFrame into an efficient SplitMatrix.

Parameters
----------
df : pl.DataFrame
Polars DataFrame to convert.
dtype : np.dtype, default np.float64
dtype of all sub-matrices of the resulting SplitMatrix.
sparse_threshold : float, default 0.1
Density threshold below which numerical columns will be stored in a sparse
format.
cat_threshold : int, default 4
Number of levels of a categorical column under which the column will be stored
as sparse one-hot-encoded columns instead of CategoricalMatrix
object_as_cat : bool, default False
If True, DataFrame columns stored as ``pl.String`` will be treated as
categorical columns. Note that this is different from pandas, where all object
columns are converted to categorical columns.
cat_position : str {'end'|'expand'}, default 'expand'
Position of the categorical variable in the index. If "end", all the
categoricals (including the ones that did not satisfy cat_threshold)
will be placed at the end of the index list. If "expand", all the variables
will remain in the same order.
drop_first : bool, default False
If True, categorical variables will have their first category dropped.
This allows multiple categorical variables to be included in an
unregularized model. If False, all categories are included.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
How to handle missing values in categorical columns:
- if 'fail', raise an error if there are missing values.
- if 'zero', missing values will represent all-zero indicator columns.
- if 'convert', missing values will be converted to the '(MISSING)' category.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``.

Returns
-------
SplitMatrix
"""
return _from_dataframe(
return from_df(
df,
engine="polars",
dtype=dtype,
sparse_threshold=sparse_threshold,
cat_threshold=cat_threshold,
Expand Down
Loading
Loading