Release

cerlymarco · cerlymarco · commit 6b76c6140eec · 2022-07-21T10:55:22.000+02:00
diff --git a/lineartree/_classes.py b/lineartree/_classes.py
@@ -152,7 +152,7 @@ def _parallel_binning_fit(split_feat, _self, X, y,
                                    weights=weights[mask], **largs_right)
                 wloss_right = loss_right * (weights[mask].sum() / weights.sum())
 
-            total_loss = wloss_left + wloss_right
+            total_loss = round(wloss_left + wloss_right, 5)
 
             # store if best
             if total_loss < loss:
@@ -214,15 +214,16 @@ class _LinearTree(BaseEstimator):
     """
     def __init__(self, base_estimator, *, criterion, max_depth,
                  min_samples_split, min_samples_leaf, max_bins,
-                 categorical_features, split_features,
-                 linear_features, n_jobs):
+                 min_impurity_decrease, categorical_features,
+                 split_features, linear_features, n_jobs):
 
         self.base_estimator = base_estimator
         self.criterion = criterion
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_bins = max_bins
+        self.min_impurity_decrease = min_impurity_decrease
         self.categorical_features = categorical_features
         self.split_features = split_features
         self.linear_features = linear_features
@@ -295,7 +296,7 @@ def _split(self, X, y, bins,
 
         # select best results
         _id_best = np.argmin(_losses)
-        if _losses[_id_best] < loss:
+        if loss - _losses[_id_best] > self.min_impurity_decrease:
             split_t = split_t[_id_best]
             split_col = split_col[_id_best]
             left_node = left_node[_id_best]
@@ -362,6 +363,7 @@ def _grow(self, X, y, weights=None):
         loss = CRITERIA[self.criterion](
             model, X[:, self._linear_features], y,
             weights=weights, **largs)
+        loss = round(loss, 5)
 
         self._nodes[''] = Node(
             id=0,
@@ -651,8 +653,8 @@ def summary(self, feature_names=None, only_leaves=False, max_depth=None):
 
                 summary[N.id] = {
                     'col': feature_names[Cl.threshold[-1][0]],
-                    'th': round(Cl.threshold[-1][-1], 4),
-                    'loss': round(Cl.w_loss + Cr.w_loss, 4),
+                    'th': round(Cl.threshold[-1][-1], 5),
+                    'loss': round(Cl.w_loss + Cr.w_loss, 5),
                     'samples': Cl.n_samples + Cr.n_samples,
                     'children': (Cl.id, Cr.id),
                     'models': (Cl.model, Cr.model)
@@ -664,7 +666,7 @@ def summary(self, feature_names=None, only_leaves=False, max_depth=None):
                 continue
 
             summary[L.id] = {
-                'loss': round(L.loss, 4),
+                'loss': round(L.loss, 5),
                 'samples': L.n_samples,
                 'models': L.model
             }
diff --git a/lineartree/lineartree.py b/lineartree/lineartree.py
@@ -59,6 +59,10 @@ class LinearTreeRegressor(_LinearTree, RegressorMixin):
         ``max_bins`` bins. Must be lower than 120 and larger than 10.
         A higher value implies a higher training time.
 
+    min_impurity_decrease : float, default=0.0
+        A node will be split only if the split induces a decrease of the
+        impurity strictly greater than this value.
+
     categorical_features : int or array-like of int, default=None
         Indicates the categorical features.
         All categorical indices must be in `[0, n_features)`.
@@ -119,15 +123,16 @@ class LinearTreeRegressor(_LinearTree, RegressorMixin):
     """
     def __init__(self, base_estimator, *, criterion='mse', max_depth=5,
                  min_samples_split=6, min_samples_leaf=0.1, max_bins=25,
-                 categorical_features=None, split_features=None,
-                 linear_features=None, n_jobs=None):
+                 min_impurity_decrease=0.0, categorical_features=None,
+                 split_features=None, linear_features=None, n_jobs=None):
 
         self.base_estimator = base_estimator
         self.criterion = criterion
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_bins = max_bins
+        self.min_impurity_decrease = min_impurity_decrease
         self.categorical_features = categorical_features
         self.split_features = split_features
         self.linear_features = linear_features
@@ -281,6 +286,10 @@ class LinearTreeClassifier(_LinearTree, ClassifierMixin):
         ``max_bins`` bins. Must be lower than 120 and larger than 10.
         A higher value implies a higher training time.
 
+    min_impurity_decrease : float, default=0.0
+        A node will be split only if the split induces a decrease of the
+        impurity strictly greater than this value.
+
     categorical_features : int or array-like of int, default=None
         Indicates the categorical features.
         All categorical indices must be in `[0, n_features)`.
@@ -341,15 +350,16 @@ class LinearTreeClassifier(_LinearTree, ClassifierMixin):
     """
     def __init__(self, base_estimator, *, criterion='hamming', max_depth=5,
                  min_samples_split=6, min_samples_leaf=0.1, max_bins=25,
-                 categorical_features=None, split_features=None,
-                 linear_features=None, n_jobs=None):
+                 min_impurity_decrease=0.0, categorical_features=None,
+                 split_features=None, linear_features=None, n_jobs=None):
 
         self.base_estimator = base_estimator
         self.criterion = criterion
         self.max_depth = max_depth
         self.min_samples_split = min_samples_split
         self.min_samples_leaf = min_samples_leaf
         self.max_bins = max_bins
+        self.min_impurity_decrease = min_impurity_decrease
         self.categorical_features = categorical_features
         self.split_features = split_features
         self.linear_features = linear_features
diff --git a/notebooks/README.md b/notebooks/README.md
@@ -43,6 +43,10 @@ class lineartree.LinearTreeRegressor(base_estimator, *, criterion = 'mse', max_d
 
     The maximum number of bins to use to search the optimal split in each feature. Features with a small number of unique values may use less than ``max_bins`` bins. Must be lower than 120 and larger than 10. 
     A higher value implies a higher training time. 
+    
+- ```min_impurity_decrease : float, default=0.0```
+
+    A node will be split only if the split induces a decrease of the impurity strictly greater than this value.
 
 - ```categorical_features : int or array-like of int, default=None```
 
@@ -279,6 +283,10 @@ class lineartree.LinearTreeClassifier(base_estimator, *, criterion = 'hamming',
 
     The maximum number of bins to use to search the optimal split in each feature. Features with a small number of unique values may use less than ``max_bins`` bins. Must be lower than 120 and larger than 10. 
     A higher value implies a higher training time. 
+    
+- ```min_impurity_decrease : float, default=0.0```
+
+    A node will be split only if the split induces a decrease of the impurity strictly greater than this value.
 
 - ```categorical_features : int or array-like of int, default=None```
 
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 HERE = pathlib.Path(__file__).parent
 
-VERSION = '0.3.3'
+VERSION = '0.3.4'
 PACKAGE_NAME = 'linear-tree'
 AUTHOR = 'Marco Cerliani'
 AUTHOR_EMAIL = 'cerlymarco@gmail.com'