FEAT implement sparse group lasso penalty and ws_strategy="fixpoint" for BCD (#267)

mathurinm · web-flow · commit d6ab8c2618c4 · 2024-07-14T20:37:30.000+02:00
diff --git a/examples/plot_group_logistic_regression.py b/examples/plot_group_logistic_regression.py
@@ -41,4 +41,4 @@
 
 # %%
 # Fit check that groups are either all 0 or all non zero
-print(clf.coef_.reshape(-1, grp_size))
+print(clf.coef_.reshape(-1, grp_size))
diff --git a/examples/plot_logreg_various_penalties.py b/examples/plot_logreg_various_penalties.py
@@ -55,23 +55,20 @@
     clf_enet.coef_[clf_enet.coef_ != 0],
     markerfmt="x",
     label="Elastic net coefficients",
-    use_line_collection=True,
 )
 plt.setp([m, s], color="#2ca02c")
 m, s, _ = plt.stem(
     np.where(clf_mcp.coef_.ravel())[0],
     clf_mcp.coef_[clf_mcp.coef_ != 0],
     markerfmt="x",
     label="MCP coefficients",
-    use_line_collection=True,
 )
 plt.setp([m, s], color="#ff7f0e")
 plt.stem(
     np.where(w_star)[0],
     w_star[w_star != 0],
     label="true coefficients",
     markerfmt="bx",
-    use_line_collection=True,
 )
 
 plt.legend(loc="best")
diff --git a/examples/plot_sparse_group_lasso.py b/examples/plot_sparse_group_lasso.py
@@ -0,0 +1,61 @@
+"""
+=================================
+Fast Sparse Group Lasso in python
+=================================
+Scikit-learn is missing a Sparse Group Lasso regression estimator. We show how to
+implement one with ``skglm``.
+"""
+
+# Author: Mathurin Massias
+
+# %%
+import numpy as np
+import matplotlib.pyplot as plt
+
+from skglm.solvers import GroupBCD
+from skglm.datafits import QuadraticGroup
+from skglm import GeneralizedLinearEstimator
+from skglm.penalties import WeightedL1GroupL2
+from skglm.utils.data import make_correlated_data, grp_converter
+
+n_features = 30
+X, y, _ = make_correlated_data(
+    n_samples=10, n_features=30, random_state=0)
+
+
+# %%
+# Model creation: combination of penalty, datafit and solver.
+#
+# penalty:
+grp_size = 10  # take groups of 10 consecutive features
+n_groups = n_features // grp_size
+grp_indices, grp_ptr = grp_converter(grp_size, n_features)
+n_groups = len(grp_ptr) - 1
+weights_g = np.ones(n_groups, dtype=np.float64)
+weights_f = 0.5 * np.ones(n_features)
+penalty = WeightedL1GroupL2(
+    alpha=0.5, weights_groups=weights_g,
+    weights_features=weights_f, grp_indices=grp_indices, grp_ptr=grp_ptr)
+
+# %% Datafit and solver
+datafit = QuadraticGroup(grp_ptr, grp_indices)
+solver = GroupBCD(ws_strategy="fixpoint", verbose=1, fit_intercept=False, tol=1e-10)
+
+model = GeneralizedLinearEstimator(datafit, penalty, solver=solver)
+
+# %%
+# Train the model
+clf = GeneralizedLinearEstimator(datafit, penalty, solver)
+clf.fit(X, y)
+
+# %%
+# Some groups are fully 0, and inside non zero groups,
+# some values are 0 too
+plt.imshow(clf.coef_.reshape(-1, grp_size) != 0, cmap='Greys')
+plt.title("Non zero values (in black) in model coefficients")
+plt.ylabel('Group index')
+plt.xlabel('Feature index inside group')
+plt.xticks(np.arange(grp_size))
+plt.yticks(np.arange(n_groups));
+
+# %%
diff --git a/skglm/penalties/__init__.py b/skglm/penalties/__init__.py
@@ -4,7 +4,7 @@
     WeightedL1, IndicatorBox, PositiveConstraint, LogSumPenalty
 )
 from .block_separable import (
-    L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2
+    L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2, WeightedL1GroupL2
 )
 
 from .non_separable import SLOPE
@@ -14,5 +14,5 @@
     BasePenalty,
     L1_plus_L2, L0_5, L1, L2, L2_3, MCPenalty, WeightedMCPenalty, SCAD, WeightedL1,
     IndicatorBox, PositiveConstraint, L2_05, L2_1, BlockMCPenalty, BlockSCAD,
-    WeightedGroupL2, SLOPE, LogSumPenalty
+    WeightedGroupL2, WeightedL1GroupL2, SLOPE, LogSumPenalty
 ]
diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py
@@ -6,7 +6,7 @@
 
 from skglm.penalties.base import BasePenalty
 from skglm.utils.prox_funcs import (
-    BST, prox_block_2_05, prox_SCAD, value_SCAD, prox_MCP, value_MCP)
+    BST, ST_vec, prox_block_2_05, prox_SCAD, value_SCAD, prox_MCP, value_MCP)
 
 
 class L2_1(BasePenalty):
@@ -382,3 +382,109 @@ def generalized_support(self, w):
                 gsupp[g] = True
 
         return gsupp
+
+
+class WeightedL1GroupL2(BasePenalty):
+    r"""Weighted Group L2 penalty, aka sparse group Lasso.
+
+    The penalty reads
+
+    .. math::
+        sum_{g=1}^{n_"groups"} "weights"^1_g ||w_{[g]}|| +
+        sum_{j=1}^{n_"features"} "weights"^2_j ||w_{j}||
+
+    with :math:`w_{[g]}` being the coefficients of the g-th group and
+
+    Attributes
+    ----------
+    alpha : float
+        The regularization parameter.
+
+    weights_groups : array, shape (n_groups,)
+        The penalization weights of the groups.
+
+    weights_features : array, shape (n_features,)
+        The penalization weights of the features.
+
+    grp_indices : array, shape (n_features,)
+        The group indices stacked contiguously
+        ([grp1_indices, grp2_indices, ...]).
+
+    grp_ptr : array, shape (n_groups + 1,)
+        The group pointers such that two consecutive elements delimit
+        the indices of a group in ``grp_indices``.
+
+    """
+
+    def __init__(
+            self, alpha, weights_groups, weights_features, grp_ptr, grp_indices):
+        self.alpha = alpha
+        self.grp_ptr, self.grp_indices = grp_ptr, grp_indices
+        self.weights_groups = weights_groups
+        self.weights_features = weights_features
+
+    def get_spec(self):
+        spec = (
+            ('alpha', float64),
+            ('weights_groups', float64[:]),
+            ('weights_features', float64[:]),
+            ('grp_ptr', int32[:]),
+            ('grp_indices', int32[:]),
+        )
+        return spec
+
+    def params_to_dict(self):
+        return dict(alpha=self.alpha, weights_features=self.weights_features,
+                    weights_groups=self.weights_groups, grp_ptr=self.grp_ptr,
+                    grp_indices=self.grp_indices)
+
+    def value(self, w):
+        """Value of penalty at vector ``w``."""
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        n_grp = len(grp_ptr) - 1
+
+        sum_penalty = 0.
+        for g in range(n_grp):
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            w_g = w[grp_g_indices]
+
+            sum_penalty += self.weights_groups[g] * norm(w_g)
+        sum_penalty += np.sum(self.weights_features * np.abs(w))
+
+        return self.alpha * sum_penalty
+
+    def prox_1group(self, value, stepsize, g):
+        """Compute the proximal operator of group ``g``."""
+        res = ST_vec(value, self.alpha * stepsize * self.weights_features[g])
+        return BST(res, self.alpha * stepsize * self.weights_groups[g])
+
+    def subdiff_distance(self, w, grad_ws, ws):
+        """Compute distance to the subdifferential at ``w`` of negative gradient.
+
+        Refer to :ref:`subdiff_positive_group_lasso` for details of the derivation.
+
+        Note:
+        ----
+        ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``.
+        """
+        raise NotImplementedError("Too hard for now")
+
+    def is_penalized(self, n_groups):
+        return np.ones(n_groups, dtype=np.bool_)
+
+    def generalized_support(self, w):
+        grp_indices, grp_ptr = self.grp_indices, self.grp_ptr
+        n_groups = len(grp_ptr) - 1
+        is_penalized = self.is_penalized(n_groups)
+
+        gsupp = np.zeros(n_groups, dtype=np.bool_)
+        for g in range(n_groups):
+            if not is_penalized[g]:
+                gsupp[g] = True
+                continue
+
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            if np.any(w[grp_g_indices]):
+                gsupp[g] = True
+
+        return gsupp
diff --git a/skglm/solvers/anderson_cd.py b/skglm/solvers/anderson_cd.py
@@ -103,6 +103,8 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
             # The intercept is not taken into account in the optimality conditions since
             # the derivative w.r.t. to the intercept may be very large. It is not likely
             # to change significantly the optimality conditions.
+            # TODO: MM I don't understand the comment above: the intercept is
+            # taken into account intercept_opt 6 lines below
             if self.ws_strategy == "subdiff":
                 opt = penalty.subdiff_distance(w[:n_features], grad, all_feats)
             elif self.ws_strategy == "fixpoint":
diff --git a/skglm/solvers/common.py b/skglm/solvers/common.py
@@ -1,10 +1,11 @@
 import numpy as np
 from numba import njit
+from numpy.linalg import norm
 
 
 @njit
 def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws):
-    """Compute the violation of the fixed point iterate scheme.
+    """Compute the violation of the fixed point iterate scheme for CD.
 
     Parameters
     ----------
@@ -44,6 +45,60 @@ def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws):
     return dist
 
 
+@njit
+def dist_fix_point_bcd(
+        w, grad_ws, lipschitz_ws, datafit, penalty, ws):
+    """Compute the violation of the fixed point iterate scheme for BCD.
+
+    Parameters
+    ----------
+    w : array, shape (n_features,)
+        Coefficient vector.
+
+    grad_ws : array, shape (ws_size,)
+        Gradient restricted to the working set.
+
+    lipschitz_ws :  array, shape (len(ws),)
+        Coordinatewise gradient Lipschitz constants, restricted to working set.
+
+    datafit: instance of BaseDatafit
+        Datafit.
+
+    penalty: instance of BasePenalty
+        Penalty.
+
+    ws : array, shape (len(ws),)
+        The working set.
+
+    Returns
+    -------
+    dist : array, shape (n_groups,)
+        Violation score for every group.
+
+    Note:
+        ----
+        ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``.
+    """
+    n_groups = len(penalty.grp_ptr) - 1
+    dist = np.zeros(n_groups, dtype=w.dtype)
+
+    grad_ptr = 0
+    for idx, g in enumerate(ws):
+        if lipschitz_ws[idx] == 0.:
+            continue
+        grp_g_indices = penalty.grp_indices[penalty.grp_ptr[g]: penalty.grp_ptr[g+1]]
+
+        grad_g = grad_ws[grad_ptr: grad_ptr + len(grp_g_indices)]
+        grad_ptr += len(grp_g_indices)
+
+        step_g = 1 / lipschitz_ws[idx]
+        w_g = w[grp_g_indices]
+        dist[idx] = norm(
+            w_g - penalty.prox_1group(w_g - grad_g * step_g, step_g, g)
+        )
+    return dist
+
+
 @njit
 def construct_grad(X, y, w, Xw, datafit, ws):
     """Compute the gradient of the datafit restricted to the working set.
diff --git a/skglm/solvers/group_bcd.py b/skglm/solvers/group_bcd.py
@@ -5,6 +5,7 @@
 from skglm.solvers.base import BaseSolver
 from skglm.utils.anderson import AndersonAcceleration
 from skglm.utils.validation import check_group_compatible
+from skglm.solvers.common import dist_fix_point_bcd
 
 
 class GroupBCD(BaseSolver):
@@ -36,17 +37,22 @@ class GroupBCD(BaseSolver):
         Amount of verbosity. 0/False is silent.
     """
 
-    def __init__(self, max_iter=1000, max_epochs=100, p0=10, tol=1e-4,
-                 fit_intercept=False, warm_start=False, verbose=0):
+    def __init__(
+            self, max_iter=1000, max_epochs=100, p0=10, tol=1e-4, fit_intercept=False,
+            warm_start=False, ws_strategy="subdiff", verbose=0):
         self.max_iter = max_iter
         self.max_epochs = max_epochs
         self.p0 = p0
         self.tol = tol
         self.fit_intercept = fit_intercept
         self.warm_start = warm_start
+        self.ws_strategy = ws_strategy
         self.verbose = verbose
 
     def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
+        if self.ws_strategy not in ("subdiff", "fixpoint"):
+            raise ValueError(
+                'Unsupported value for self.ws_strategy:', self.ws_strategy)
         check_group_compatible(datafit)
         check_group_compatible(penalty)
 
@@ -86,7 +92,14 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
                     X.data, X.indptr, X.indices, y, w, Xw, datafit, all_groups)
             else:
                 grad = _construct_grad(X, y, w, Xw, datafit, all_groups)
-            opt = penalty.subdiff_distance(w, grad, all_groups)
+
+            if self.ws_strategy == "subdiff":
+                # MM TODO: AndersonCD passes w[:n_features] here
+                opt = penalty.subdiff_distance(w, grad, all_groups)
+            elif self.ws_strategy == "fixpoint":
+                opt = dist_fix_point_bcd(
+                    w[:n_features], grad, lipschitz, datafit, penalty, all_groups
+                )
 
             if self.fit_intercept:
                 intercept_opt = np.abs(datafit.intercept_update_step(y, Xw))
@@ -144,8 +157,15 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
                     else:
                         grad_ws = _construct_grad(X, y, w, Xw, datafit, ws)
 
-                    opt_in = penalty.subdiff_distance(w, grad_ws, ws)
-                    stop_crit_in = np.max(opt_in)
+                    if self.ws_strategy == "subdiff":
+                        # TODO MM: AndersonCD uses w[:n_features] here
+                        opt_ws = penalty.subdiff_distance(w, grad_ws, ws)
+                    elif self.ws_strategy == "fixpoint":
+                        opt_ws = dist_fix_point_bcd(
+                            w, grad_ws, lipschitz[ws], datafit, penalty, ws
+                        )
+
+                    stop_crit_in = np.max(opt_ws)
 
                     if max(self.verbose - 1, 0):
                         p_obj = datafit.value(y, w, Xw) + penalty.value(w)
diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py

Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,7 @@`
`4`	`4`	`WeightedL1, IndicatorBox, PositiveConstraint, LogSumPenalty`
`5`	`5`	`)`
`6`	`6`	`from .block_separable import (`
`7`		`- L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2`
	`7`	`+ L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2, WeightedL1GroupL2`
`8`	`8`	`)`
`9`	`9`
`10`	`10`	`from .non_separable import SLOPE`
`@@ -14,5 +14,5 @@`
`14`	`14`	`BasePenalty,`
`15`	`15`	`L1_plus_L2, L0_5, L1, L2, L2_3, MCPenalty, WeightedMCPenalty, SCAD, WeightedL1,`
`16`	`16`	`IndicatorBox, PositiveConstraint, L2_05, L2_1, BlockMCPenalty, BlockSCAD,`
`17`		`- WeightedGroupL2, SLOPE, LogSumPenalty`
	`17`	`+ WeightedGroupL2, WeightedL1GroupL2, SLOPE, LogSumPenalty`
`18`	`18`	`]`