5
5
from collections .abc import Iterable
6
6
from typing import Any , Optional , Union
7
7
8
- import numpy
9
- import pandas
8
+ import numpy as np
9
+ import pandas as pd
10
10
from formulaic import ModelMatrix , ModelSpec
11
11
from formulaic .errors import FactorEncodingError
12
12
from formulaic .materializers import FormulaMaterializer
@@ -33,7 +33,7 @@ class TabmatMaterializer(FormulaMaterializer):
33
33
"""Materializer for pandas input and tabmat output."""
34
34
35
35
REGISTER_NAME = "tabmat"
36
- REGISTER_INPUTS = ("pandas .core.frame.DataFrame" , "pandas .DataFrame" )
36
+ REGISTER_INPUTS = ("pd .core.frame.DataFrame" , "pd .DataFrame" )
37
37
REGISTER_OUTPUTS = "tabmat"
38
38
39
39
@override
@@ -43,7 +43,7 @@ def _init(self):
43
43
"categorical_format" , "{name}[{category}]"
44
44
)
45
45
self .intercept_name = self .params .get ("intercept_name" , "Intercept" )
46
- self .dtype = self .params .get ("dtype" , numpy .float64 )
46
+ self .dtype = self .params .get ("dtype" , np .float64 )
47
47
self .sparse_threshold = self .params .get ("sparse_threshold" , 0.1 )
48
48
self .cat_threshold = self .params .get ("cat_threshold" , 4 )
49
49
self .add_column_for_intercept = self .params .get (
@@ -57,9 +57,9 @@ def _init(self):
57
57
58
58
@override
59
59
def _is_categorical (self , values ):
60
- if isinstance (values , (pandas .Series , pandas .Categorical )):
60
+ if isinstance (values , (pd .Series , pd .Categorical )):
61
61
return values .dtype == object or isinstance (
62
- values .dtype , pandas .CategoricalDtype
62
+ values .dtype , pd .CategoricalDtype
63
63
)
64
64
return super ()._is_categorical (values )
65
65
@@ -69,12 +69,12 @@ def _check_for_nulls(self, name, values, na_action, drop_rows):
69
69
return
70
70
71
71
if na_action is NAAction .RAISE :
72
- if isinstance (values , pandas .Series ) and values .isnull ().values .any ():
72
+ if isinstance (values , pd .Series ) and values .isnull ().values .any ():
73
73
raise ValueError (f"`{ name } ` contains null values after evaluation." )
74
74
75
75
elif na_action is NAAction .DROP :
76
- if isinstance (values , pandas .Series ):
77
- drop_rows .update (numpy .flatnonzero (values .isnull ().values ))
76
+ if isinstance (values , pd .Series ):
77
+ drop_rows .update (np .flatnonzero (values .isnull ().values ))
78
78
79
79
else :
80
80
raise ValueError (
@@ -83,17 +83,17 @@ def _check_for_nulls(self, name, values, na_action, drop_rows):
83
83
84
84
@override
85
85
def _encode_constant (self , value , metadata , encoder_state , spec , drop_rows ):
86
- series = value * numpy .ones (self .nrows - len (drop_rows ))
86
+ series = value * np .ones (self .nrows - len (drop_rows ))
87
87
return _InteractableDenseVector (series , name = self .intercept_name )
88
88
89
89
@override
90
90
def _encode_numerical (self , values , metadata , encoder_state , spec , drop_rows ):
91
91
if drop_rows :
92
92
values = values .drop (index = values .index [drop_rows ])
93
- if isinstance (values , pandas .Series ):
93
+ if isinstance (values , pd .Series ):
94
94
values = values .to_numpy ().astype (self .dtype )
95
95
if (values != 0 ).mean () <= self .sparse_threshold :
96
- return _InteractableSparseVector (sps .csc_matrix (values [:, numpy .newaxis ]))
96
+ return _InteractableSparseVector (sps .csc_matrix (values [:, np .newaxis ]))
97
97
else :
98
98
return _InteractableDenseVector (values )
99
99
@@ -118,7 +118,7 @@ def _encode_categorical(
118
118
def _combine_columns (self , cols , spec , drop_rows ):
119
119
# Special case no columns
120
120
if not cols :
121
- values = numpy .empty ((self .data .shape [0 ], 0 ), dtype = self .dtype )
121
+ values = np .empty ((self .data .shape [0 ], 0 ), dtype = self .dtype )
122
122
return DenseMatrix (values )
123
123
124
124
# Otherwise, concatenate columns into SplitMatrix
@@ -305,7 +305,7 @@ class _InteractableVector(ABC):
305
305
@abstractmethod
306
306
def to_tabmat (
307
307
self ,
308
- dtype : numpy . dtype ,
308
+ dtype : np . typing . DTypeLike ,
309
309
sparse_threshold : float ,
310
310
cat_threshold : int ,
311
311
) -> MatrixBase :
@@ -345,7 +345,7 @@ def set_name(self, name, name_format):
345
345
346
346
347
347
class _InteractableDenseVector (_InteractableVector ):
348
- def __init__ (self , values : numpy .ndarray , name : Optional [str ] = None ):
348
+ def __init__ (self , values : np .ndarray , name : Optional [str ] = None ):
349
349
self .values = values
350
350
self .name = name
351
351
@@ -358,7 +358,7 @@ def __rmul__(self, other):
358
358
359
359
def to_tabmat (
360
360
self ,
361
- dtype : numpy . dtype = numpy .float64 ,
361
+ dtype : np . typing . DTypeLike = np .float64 ,
362
362
sparse_threshold : float = 0.1 ,
363
363
cat_threshold : int = 4 ,
364
364
) -> Union [SparseMatrix , DenseMatrix ]:
@@ -367,7 +367,7 @@ def to_tabmat(
367
367
else :
368
368
# Columns can become sparser, but not denser through interactions
369
369
return SparseMatrix (
370
- sps .csc_matrix (self .values [:, numpy .newaxis ]), column_names = [self .name ]
370
+ sps .csc_matrix (self .values [:, np .newaxis ]), column_names = [self .name ]
371
371
)
372
372
373
373
def get_names (self ) -> list [str ]:
@@ -394,7 +394,7 @@ def __rmul__(self, other):
394
394
395
395
def to_tabmat (
396
396
self ,
397
- dtype : numpy . dtype = numpy .float64 ,
397
+ dtype : np . typing . DTypeLike = np .float64 ,
398
398
sparse_threshold : float = 0.1 ,
399
399
cat_threshold : int = 4 ,
400
400
) -> SparseMatrix :
@@ -413,9 +413,9 @@ def set_name(self, name, name_format=None) -> "_InteractableSparseVector":
413
413
class _InteractableCategoricalVector (_InteractableVector ):
414
414
def __init__ (
415
415
self ,
416
- codes : numpy .ndarray ,
416
+ codes : np .ndarray ,
417
417
categories : list [str ],
418
- multipliers : numpy .ndarray ,
418
+ multipliers : np .ndarray ,
419
419
name : Optional [str ] = None ,
420
420
):
421
421
# sentinel values for codes:
@@ -429,15 +429,15 @@ def __init__(
429
429
@classmethod
430
430
def from_categorical (
431
431
cls ,
432
- cat : pandas .Categorical ,
432
+ cat : pd .Categorical ,
433
433
reduced_rank : bool ,
434
434
missing_method : str = "fail" ,
435
435
missing_name : str = "(MISSING)" ,
436
436
add_missing_category : bool = False ,
437
437
) -> "_InteractableCategoricalVector" :
438
438
"""Create an interactable categorical vector from a pandas categorical."""
439
439
categories = list (cat .categories )
440
- codes = cat .codes .copy ().astype (numpy .int64 )
440
+ codes = cat .codes .copy ().astype (np .int64 )
441
441
442
442
if reduced_rank :
443
443
codes [codes == 0 ] = - 2
@@ -457,7 +457,7 @@ def from_categorical(
457
457
return cls (
458
458
codes = codes ,
459
459
categories = categories ,
460
- multipliers = numpy .ones (len (cat .codes )),
460
+ multipliers = np .ones (len (cat .codes )),
461
461
)
462
462
463
463
def __rmul__ (self , other ):
@@ -471,7 +471,7 @@ def __rmul__(self, other):
471
471
472
472
def to_tabmat (
473
473
self ,
474
- dtype : numpy . dtype = numpy .float64 ,
474
+ dtype : np . typing . DTypeLike = np .float64 ,
475
475
sparse_threshold : float = 0.1 ,
476
476
cat_threshold : int = 4 ,
477
477
) -> Union [DenseMatrix , CategoricalMatrix , SplitMatrix ]:
@@ -485,7 +485,7 @@ def to_tabmat(
485
485
else :
486
486
drop_first = False
487
487
488
- cat = pandas .Categorical .from_codes (
488
+ cat = pd .Categorical .from_codes (
489
489
codes = codes ,
490
490
categories = categories ,
491
491
ordered = False ,
@@ -502,12 +502,12 @@ def to_tabmat(
502
502
503
503
if (self .codes == - 2 ).all ():
504
504
# All values are dropped
505
- return DenseMatrix (numpy .empty ((len (codes ), 0 ), dtype = dtype ))
505
+ return DenseMatrix (np .empty ((len (codes ), 0 ), dtype = dtype ))
506
506
elif (self .multipliers == 1 ).all () and len (categories ) >= cat_threshold :
507
507
return categorical_part
508
508
else :
509
509
sparse_matrix = sps .csc_matrix (
510
- categorical_part .tocsr ().multiply (self .multipliers [:, numpy .newaxis ])
510
+ categorical_part .tocsr ().multiply (self .multipliers [:, np .newaxis ])
511
511
)
512
512
(
513
513
dense_part ,
@@ -744,7 +744,7 @@ def encode_contrasts(
744
744
f"Column { data .name } contains unseen categories: { unseen_categories } ."
745
745
)
746
746
747
- cat = pandas .Categorical (data ._values , categories = levels )
747
+ cat = pd .Categorical (data ._values , categories = levels )
748
748
_state ["categories" ] = cat .categories
749
749
_state ["add_missing_category" ] = add_missing_category or (
750
750
missing_method == "convert" and cat .isna ().any ()
0 commit comments