
Commit c2ab38b

Merge pull request #649 from MilesCranmer/var-complexity
Per-variable custom complexities
2 parents: 96d6ea9 + cabda12

10 files changed: +209 -46 lines

.github/workflows/CI.yml (+1 -1)

@@ -90,7 +90,7 @@ jobs:
       - name: "Coveralls"
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}
+          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}-${{ matrix.test-id }}
           COVERALLS_PARALLEL: true
         run: coveralls --service=github

pyproject.toml (+2 -1)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pysr"
-version = "0.18.4"
+version = "0.18.5"
 authors = [
     {name = "Miles Cranmer", email = "[email protected]"},
 ]
@@ -41,4 +41,5 @@ dev-dependencies = [
     "pandas-stubs>=2.2.1.240316",
     "types-pytz>=2024.1.0.20240417",
     "types-openpyxl>=3.1.0.20240428",
+    "coverage>=7.5.3",
 ]

pysr/juliapkg.json (+1 -1)

@@ -3,7 +3,7 @@
   "packages": {
     "SymbolicRegression": {
       "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-      "version": "=0.24.4"
+      "version": "=0.24.5"
     },
     "Serialization": {
       "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",

pysr/sr.py (+100 -25)

@@ -1,8 +1,6 @@
 """Define the PySRRegressor scikit-learn interface."""
 
 import copy
-import difflib
-import inspect
 import os
 import pickle as pkl
 import re
@@ -57,6 +55,7 @@
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
+    _suggest_keywords,
 )
 
 ALREADY_RAN = False
@@ -122,7 +121,7 @@ def _maybe_create_inline_operators(
                 "and underscores are allowed."
             )
         if (extra_sympy_mappings is None) or (
-            not function_name in extra_sympy_mappings
+            function_name not in extra_sympy_mappings
         ):
             raise ValueError(
                 f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
@@ -139,6 +138,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
     weights,
     y,
     X_units,
@@ -163,6 +163,13 @@ def _check_assertions(
                 "and underscores are allowed."
             )
             assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -333,7 +340,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `idx` argument to the function, which is `nothing`
         for non-batched, and a 1D array of indices for batched.
         Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
         If you would like to use a complexity other than 1 for an
         operator, specify the complexity here. For example,
         `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -342,10 +349,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         numbers for a complexity, and the total complexity of a tree
         will be rounded to the nearest integer after computing.
         Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
         Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
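As the revised docstring says, a single number set at `__init__` acts as a global per-variable cost, while a list passed to `fit` prices each feature individually; the two are mutually exclusive. A short sketch of both call patterns, with made-up data:

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 3)
    y = X[:, 0] + 2 * X[:, 1]

    # Option 1: one global complexity for every variable, set at __init__.
    model = PySRRegressor(complexity_of_variables=2)
    model.fit(X, y)

    # Option 2: per-variable complexities, passed to fit (one per feature).
    model = PySRRegressor()
    model.fit(X, y, complexity_of_variables=[1, 1, 5])  # third variable costs 5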
@@ -691,6 +701,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -722,7 +733,7 @@ def __init__(
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1344,13 +1355,22 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
         return param_container
 
     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1375,6 +1395,8 @@ def _validate_and_set_fit_params(
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1422,6 +1444,22 @@
                 "Please use valid names instead."
             )
 
+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set it to global default, OR use `fit` to set it for "
+                "each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
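The branch above resolves precedence: a value passed to `fit` wins, otherwise the `__init__` value is used, otherwise the default of 1; supplying both is an error. For example, with made-up data:

    import numpy as np
    from pysr import PySRRegressor

    X, y = np.random.randn(100, 3), np.random.randn(100)

    model = PySRRegressor(complexity_of_variables=2)  # set globally here...
    model.fit(X, y, complexity_of_variables=[1, 2, 3])  # ...and again here: ValueError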
@@ -1452,10 +1490,20 @@
         else:
             raise NotImplementedError("y shape not supported!")
 
+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)
 
-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )
 
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1471,6 +1519,7 @@ def _pre_transform_training_data(
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1493,6 +1542,8 @@
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1543,6 +1594,14 @@
             ],
         )
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
         if X_units is not None:
             X_units = cast(
                 ArrayLike[str],
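When feature selection is active, the list of per-variable complexities is filtered by the same `selection_mask` applied to the columns of `X`, so complexities stay aligned with the surviving features. A standalone illustration of that masking, with made-up values (the real mask is a boolean array produced by feature selection):

    complexity_of_variables = [1, 2, 3, 4]
    selection_mask = [True, False, True, False]
    filtered = [
        c for c, keep in zip(complexity_of_variables, selection_mask) if keep
    ]
    assert filtered == [1, 3]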
@@ -1567,7 +1626,7 @@
         else:
             X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units
 
     def _run(
         self,
@@ -1624,6 +1683,7 @@ def _run(
 
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
         cluster_manager = self.cluster_manager
 
         # Start julia backend processes
@@ -1668,6 +1728,9 @@
             complexity_of_operators = jl.seval(complexity_of_operators_str)
             # TODO: Refactor this into helper function
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
@@ -1726,7 +1789,7 @@
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1871,6 +1934,9 @@ def fit(
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1931,6 +1997,7 @@
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None
 
@@ -1944,10 +2011,18 @@
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1965,8 +2040,17 @@
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1993,6 +2077,7 @@
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
@@ -2465,16 +2550,6 @@
         return with_preamble(table_string)
 
 
-def _suggest_keywords(cls, k: str) -> List[str]:
-    valid_keywords = [
-        param
-        for param in inspect.signature(cls.__init__).parameters
-        if param not in ["self", "kwargs"]
-    ]
-    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
-    return suggestions
-
-
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
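Taken together, these changes let the search penalize individual variables. A rough end-to-end sketch of the new keyword (operators, data, and settings are made up for illustration):

    import numpy as np
    from pysr import PySRRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(200, 2)
    y = X[:, 0] ** 2 + 2.0 * np.cos(X[:, 1])

    model = PySRRegressor(
        binary_operators=["+", "*"],
        unary_operators=["cos"],
        niterations=20,
    )
    # Make the second variable three times as "expensive" as the first,
    # so equally accurate expressions built from X[:, 0] rank better.
    model.fit(X, y, complexity_of_variables=[1, 3])
    print(model.get_best())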

pysr/test/params.py (+1 -1)

@@ -1,6 +1,6 @@
 import inspect
 
-from .. import PySRRegressor
+from pysr import PySRRegressor
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
