@@ -1,8 +1,6 @@
 """Define the PySRRegressor scikit-learn interface."""
 
 import copy
-import difflib
-import inspect
 import os
 import pickle as pkl
 import re
@@ -57,6 +55,7 @@
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
+    _suggest_keywords,
 )
 
 ALREADY_RAN = False
@@ -122,7 +121,7 @@ def _maybe_create_inline_operators(
                         "and underscores are allowed."
                     )
                 if (extra_sympy_mappings is None) or (
-                    not function_name in extra_sympy_mappings
+                    function_name not in extra_sympy_mappings
                 ):
                     raise ValueError(
                         f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
@@ -139,6 +138,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
     weights,
     y,
     X_units,
@@ -163,6 +163,13 @@ def _check_assertions(
                     "and underscores are allowed."
                 )
             assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -333,7 +340,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `idx` argument to the function, which is `nothing`
         for non-batched, and a 1D array of indices for batched.
         Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
         If you would like to use a complexity other than 1 for an
         operator, specify the complexity here. For example,
         `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -342,10 +349,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         numbers for a complexity, and the total complexity of a tree
         will be rounded to the nearest integer after computing.
         Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
         Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
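
Taken together, the docstring changes above define a two-level API: a scalar `complexity_of_variables` passed at `__init__` acts as a global default, while a list passed to `fit` assigns one complexity per feature, and combining the two is an error. A minimal usage sketch (the data and the complexity values here are made up for illustration):

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(100, 2)
y = X[:, 0] ** 2 + X[:, 1]

# Option 1: one global complexity for every variable, set at __init__.
model = PySRRegressor(complexity_of_variables=2)
model.fit(X, y)

# Option 2: a per-variable list, set at fit.
# Here the second feature is made "cheaper" for the search to use.
model = PySRRegressor()
model.fit(X, y, complexity_of_variables=[3, 1])

# Setting it in both places raises a ValueError
# (see _validate_and_set_fit_params below).
```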
@@ -691,6 +701,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -722,7 +733,7 @@ def __init__(
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1344,13 +1355,22 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
         return param_container
 
     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1375,6 +1395,8 @@ def _validate_and_set_fit_params(
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1422,6 +1444,22 @@ def _validate_and_set_fit_params(
                 "Please use valid names instead."
             )
 
+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set a global default, OR use `fit` to set it for "
+                "each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
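
The branch added above resolves the effective value with a simple precedence: an explicit `fit` argument and an `__init__` setting together are an error; otherwise whichever one is set wins, with `1` as the fallback. The same rule, distilled into a standalone sketch (function name hypothetical):

```python
def resolve_complexity_of_variables(fit_value, init_value, default=1):
    """Mirror the precedence used in `_validate_and_set_fit_params`."""
    if fit_value is not None and init_value is not None:
        raise ValueError(
            "Set `complexity_of_variables` at `fit` OR `__init__`, not both."
        )
    if fit_value is not None:
        return fit_value  # per-variable list (or scalar) passed to fit
    if init_value is not None:
        return init_value  # global default from __init__
    return default
```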
@@ -1452,10 +1490,20 @@ def _validate_and_set_fit_params(
         else:
             raise NotImplementedError("y shape not supported!")
 
+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)
 
-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )
 
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1471,6 +1519,7 @@ def _pre_transform_training_data(
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1493,6 +1542,8 @@ def _pre_transform_training_data(
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1543,6 +1594,14 @@ def _pre_transform_training_data(
             ],
         )
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
         if X_units is not None:
             X_units = cast(
                 ArrayLike[str],
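
When `select_k_features` prunes columns, the hunk above filters the per-variable complexity list through the same boolean `selection_mask` already applied to `variable_names`, then refreshes the fitted attribute so it stays aligned with the surviving features. The filtering step is equivalent to this small sketch (values hypothetical):

```python
complexity_of_variables = [3, 1, 2, 5]       # one entry per original feature
selection_mask = [True, False, True, False]  # from feature selection

filtered = [
    complexity
    for complexity, keep in zip(complexity_of_variables, selection_mask)
    if keep
]
assert filtered == [3, 2]
```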
@@ -1567,7 +1626,7 @@ def _pre_transform_training_data(
         else:
             X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units
 
     def _run(
         self,
@@ -1624,6 +1683,7 @@ def _run(
 
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
        cluster_manager = self.cluster_manager
 
         # Start julia backend processes
@@ -1668,6 +1728,9 @@ def _run(
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         # TODO: Refactor this into helper function
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
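
A scalar `complexity_of_variables` can be handed to the Julia backend as-is, but a Python list must first become a Julia vector, hence the `jl_array` conversion above, matching the treatment already given to other array-valued options in this function. A rough sketch of the effect, assuming a working Julia environment via `juliacall` and the `jl_array` helper imported in this file:

```python
# Hypothetical illustration of the conversion step, not part of the diff:
from pysr.julia_helpers import jl_array

complexity_of_variables = [3, 1, 2]
jl_vec = jl_array(complexity_of_variables)  # Julia Vector, usable as a kwarg
```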
@@ -1726,7 +1789,7 @@ def _run(
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1871,6 +1934,9 @@ def fit(
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1931,6 +1997,7 @@ def fit(
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None
 
@@ -1944,10 +2011,18 @@ def fit(
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1965,8 +2040,17 @@ def fit(
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1993,6 +2077,7 @@ def fit(
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
@@ -2465,16 +2550,6 @@ def latex_table(
         return with_preamble(table_string)
 
 
-def _suggest_keywords(cls, k: str) -> List[str]:
-    valid_keywords = [
-        param
-        for param in inspect.signature(cls.__init__).parameters
-        if param not in ["self", "kwargs"]
-    ]
-    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
-    return suggestions
-
-
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
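
The deleted helper is not gone: judging by the `_suggest_keywords` import added near the top of this diff, it now lives alongside `_subscriptify` in the utils module. Its behavior, reproduced from the removed body as a self-contained sketch:

```python
import difflib
import inspect
from typing import List


def _suggest_keywords(cls, k: str) -> List[str]:
    """Suggest up to three close matches for a mistyped keyword `k`."""
    valid_keywords = [
        param
        for param in inspect.signature(cls.__init__).parameters
        if param not in ["self", "kwargs"]
    ]
    return difflib.get_close_matches(k, valid_keywords, n=3)


# e.g. _suggest_keywords(PySRRegressor, "niteration") -> ["niterations", ...]
```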