Skip to content

glum LASSO fit not finishing when same problem fits within 1 second in glmnet #709

Open
@tomwenseleers

Description

@tomwenseleers

Was just testing glum, but ran into a problem where a glum LASSO fit does not appear to finish (or if it does extremely slowly). The data I simulated was a spike train blurred with a Gaussian point spread function and with some Gaussian noise added with n=p=10000 and the sparse covariate matrix X then consisting of 10000 time-shifted copies of a Gaussian point spread function (i.e. it's a banded matrix) and 1000 out of 10000 coefficients being nonzero (all positive, ie using nonnegativity constraints) :
https://github.com/tomwenseleers/glmnet-benchmark/tree/master/data

However, if I try to fit a LASSO fit using glum it never appears to finish (or if it does, it would do so extremely slowly). The syntax I used is this one - am I doing something wrong here ?
https://github.com/tomwenseleers/glmnet-benchmark/blob/master/bench_glum.py

import numpy as np
from scipy.sparse import csc_matrix
from scipy.io import mmread
import os
import time
from glum import GeneralizedLinearRegressorCV
from glum import GeneralizedLinearRegressor

def get_data(n, p, data_prefix):
    # Create file paths
    X_file = os.path.join(data_prefix, f"{n}_{p}_X.mtx")
    y_file = os.path.join(data_prefix, f"{n}_{p}_y.csv")
    beta_true_file = os.path.join(data_prefix, f"{n}_{p}_beta_true.csv")
    
    # Read the data
    X = csc_matrix(mmread(X_file))  # Read .mtx file and convert to CSC format
    y = np.genfromtxt(y_file, delimiter=',', skip_header=1)
    beta_true = np.genfromtxt(beta_true_file, delimiter=',', skip_header=1)
    
    return X, y, beta_true

def timer_glr(X, y, glr):
    start = time.time()
    glr_fit = glr.fit(X, y)
    end = time.time()
    return glr_fit, end-start

# load simulated data (spike train convoluted with gaussian point spread function
# with 1000 / 10000 coefficients being nonzero (all positive) and Gaussian noise
# added
n = 10000
p = 10000
X_sparse, y, beta_true = get_data(n, p, 'data')

# determine optimal alpha using cross validation
glrcv = GeneralizedLinearRegressorCV(l1_ratio=1, # lasso
          family='normal',
          fit_intercept=False,
          # gradient_tol=1e-7,
          scale_predictors=False,
          lower_bounds = np.zeros(p), # nonnegativity constraints
          min_alpha_ratio=0.01 if n < p else 1e-4,
          solver='irls-cd',
          cv=10, # use 10-fold CV as in glmnet
          max_iter=10000,
          n_alphas=100 # as in glmnet
          )
glrcv_fit, elapsed = timer_glr(X_sparse, y, glrcv)
print("Coef:\n", glrcv_fit.coef_)
print("Intercept: ", glrcv_fit.intercept_)
print("N_iter: ", glrcv_fit.n_iter_)
print("Elapsed: ", elapsed)
# now refit on complete dataset using optimal alpha
glr = GeneralizedLinearRegressor(l1_ratio=1, # lasso
          family='normal',
          fit_intercept=False,
          # gradient_tol=1e-7,
          scale_predictors=False,
          lower_bounds = np.zeros(p), # nonnegativity constraints
          min_alpha_ratio=0.01 if n < p else 1e-4,
          solver='irls-cd',
          max_iter=10000,
          alpha=glrcv_fit.alpha_
          #n_alphas=100,
          #alpha_search=True
          )
glr_fit, elapsed = timer_glr(X_sparse, y, glr)
print("Coef:\n", glr_fit.coef_)
print("Intercept: ", glr_fit.intercept_)
print("N_iter: ", glr_fit.n_iter_)
print("Elapsed: ", elapsed)

If I fit the same problem in glmnet (version glmnet_4.1-8, which is the recent version redone in Rcpp) it finishes in less than a second:
https://github.com/tomwenseleers/glmnet-benchmark/blob/master/bench_glmnet.R

library(glmnet)
library(microbenchmark)
library(Matrix)
library(L0glm) # my own experimental GLM package to fit L0 penalized GLMs
library(export)

setwd("~/Github/glmnet-benchmark")

data_prefix = 'data'
n = 10000L
p = 10000L
 
get_data = function(n, p) {
  list(X=as(as(readMM(paste0(data_prefix, '/', n, '_', p, '_X.mtx')), "CsparseMatrix"), "dgCMatrix"), 
       y=read.csv(paste0(data_prefix, '/', n, '_', p, '_y.csv'), header=T),
       beta_true=read.csv(paste0(data_prefix, '/', n, '_', p, '_beta_true.csv'), header=T))
}

timer_glmnet = function(X, y) {
  time.out = microbenchmark({  cvfit <- cv.glmnet(X, y, family='gaussian', 
                                                  tol=1e-14, standardize=F, 
                                                  alpha=1, # LASSO
                                                  lower.limits=0, # nonnegativity constraints
                                                  intercept=FALSE, # no intercept
                                                  nlambda = 100) 
  fit <- glmnet(X, y, family='gaussian', 
                tol=1e-14, standardize=F, 
                alpha=1, # LASSO
                lower.limits=0, # nonnegativity constraints
                intercept=FALSE, # no intercept
                nlambda = 100) }, 
  times=1L, 
  unit='s')
  coefs = coef(fit, s=cvfit$lambda.min)
  
  list(fit=fit, coefs=coefs, elapsed=summary(time.out)$mean)
}


dat = get_data(n, p)
X_dense = as.matrix(dat$X)
X_sparse = dat$X
y = dat$y[,1]
coefs_true = dat$beta_true[,1]

## timings for glmnet when fit as sparse system ####
out_sparse = timer_glmnet(X_sparse, y)
R = cor(as.vector(out_sparse$coefs)[-1], coefs_true)
print("Correlation between estimated & true coefs:\n")
print(round(R, 4)) 
print(paste("N_iter:", out_sparse$fit$npasses)) 
print(paste("Elapsed:", out_sparse$elapsed))
# [1] "Correlation between estimated & true coefs:\n"
# > print(round(R, 4)) 
# [1] 0.9092
# > print(paste("N_iter:", out_sparse$fit$npasses)) 
# [1] "N_iter: 762"
# > print(paste("Elapsed:", out_dense$elapsed))
# [1] "Elapsed: 0.9451085"

plot(x=as.vector(out_sparse$coefs)[-1], y=coefs_true, pch=16, col='steelblue',
     xlab="estimated coefficients", ylab="true coefficients",
     main=paste0("glmnet nonnegative LASSO regression\n(n=", n,", p=", p,", R=", round(R,4), ")"))
dir.create("./plots/")
graph2png(file="./plots/glmnet_LASSO_true_vs_estimated_coefs.png", width=7, height=5)

table(as.vector(out_sparse$coefs)[-1]!=0, 
      coefs_true!=0, dnn=c("estimated beta nonzero", "true beta nonzero"))
#                 true beta nonzero
# estimated beta nonzero FALSE TRUE
#                  FALSE  7929  195
#                  TRUE   1071  805

glmnet_LASSO_true_vs_estimated_coefs

Fitting it with the covariate matrix coded as dense is slower, but still doable:

## timings for glmnet when fit as dense system ####
out_dense = timer_glmnet(X_dense, y)
R = cor(as.vector(out_dense$coefs)[-1], coefs_true)
print("Correlation between estimated & true coefs:\n")
print(round(R, 4)) 
print(paste("N_iter:", out_dense$fit$npasses)) 
print(paste("Elapsed:", out_dense$elapsed))
# [1] "Correlation between estimated & true coefs:\n"
# > print(round(R, 4)) 
# [1] 0.9063
# > print(paste("N_iter:", out_dense$fit$npasses)) 
# [1] "N_iter: 762"
# > print(paste("Elapsed:", out_dense$elapsed))
# [1] "Elapsed: 86.74697"

And incidentally, my own R package that's in development (L0glm), to fit L0 penalized GLMs using an iterative adaptive ridge procedure, can fit it 4x faster still than glmnet & also returns a much better quality solution with far fewer false positives & more true positives & better predictive performance (this is using a very simple algorithm where the weighted least square step in the GLM IRLS algorithm is replaced with a ridge regression with adaptive penalties, and using the eigen Cholesky LLT solver) :

system.time(L0glm_sparse_chol <- L0glm.fit(X_sparse, y, 
                                      family = gaussian(identity), 
                                      lambda = "gic", # options "aic", "bic", "gic", "ebic", "mbic"
                                      nonnegative = TRUE,
                                      solver = "eigen",
                                      method = 7L) # Cholesky LLT
) # 0.18s, i.e. 4x faster than sparse glmnet LASSO fit

R = cor(as.vector(L0glm_sparse_chol$coefficients), coefs_true)
print("Correlation between estimated & true coefs:\n")
print(round(R, 4)) # 0.9929 - much better solution than LASSO

plot(x=as.vector(L0glm_sparse_chol$coefficients), y=coefs_true, pch=16, col='steelblue',
     xlab="estimated coefficients", ylab="true coefficients",
     main=paste0("L0glm regression\n(n=", n,", p=", p, ", R=", round(R,4), ")"))
graph2png(file="./plots/L0glm_true_vs_estimated_coefs.png", width=7, height=5)

table(as.vector(L0glm_sparse_chol$coefficients)!=0, 
      coefs_true!=0, dnn=c("estimated beta nonzero", "true beta nonzero"));
# better true positive rate & much lower false positive rate than LASSO
#                 true beta nonzero
# estimated beta nonzero FALSE TRUE
#                  FALSE  8919   45
#                  TRUE     81  955

L0glm_true_vs_estimated_coefs

Any thoughts what I might be doing wrong in my usage of glum? Or if this is rather a bug?

Also, would it be possible to update the included benchmarks by any chance with a wider range of simulated data, e.g. generated using abess's function make_glm_data in
https://github.com/abess-team/abess/blob/master/python/abess/datasets.py
or in R
https://github.com/abess-team/abess/blob/master/R-package/R/generate.data.R ? E.g. for varying n & p & different correlation structures & error families?
Right now I feel the included benchmarks are out of date with developments in other packages...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions