From 6f7f1543674b46dabd11188c72ef58c51fb1f879 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 20 Mar 2019 13:22:15 +0100
Subject: [PATCH 1/5] Add n_iter_ attribute to estimators

---
 dask_glm/algorithms.py | 29 ++++++++++++++++++++---------
 dask_glm/estimators.py |  2 +-
 dask_glm/utils.py      |  5 +++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 63d7729..e2b58b6 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -98,6 +98,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     recalcRate = 10
     backtrackMult = firstBacktrackMult
     beta = np.zeros(p)
+    n_iter = 0

     for k in range(max_iter):
         # how necessary is this recalculation?
@@ -125,6 +126,8 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         beta = beta - stepSize * grad
         # tiny bit of repeat work here to avoid communication
         Xbeta = Xbeta - stepSize * Xgradient

+        n_iter += 1
+
         if stepSize == 0:
             break
@@ -136,7 +139,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         stepSize *= stepGrowth
         backtrackMult = nextBacktrackMult

-    return beta
+    return beta, n_iter


 @normalize
@@ -164,7 +167,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     beta = np.zeros(p)  # always init to zeros?
     Xbeta = dot(X, beta)

-    iter_count = 0
+    n_iter = 0
     converged = False

     while not converged:
@@ -181,17 +184,17 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
         step, _, _, _ = np.linalg.lstsq(hess, grad)
         beta = (beta_old - step)

-        iter_count += 1
+        n_iter += 1

         # should change this criterion
         coef_change = np.absolute(beta_old - beta)
         converged = (
-            (not np.any(coef_change > tol)) or (iter_count > max_iter))
+            (not np.any(coef_change > tol)) or (n_iter > max_iter))

         if not converged:
             Xbeta = dot(X, beta)  # numpy -> dask converstion of beta

-    return beta
+    return beta, n_iter


 @normalize
@@ -256,6 +259,8 @@ def wrapped(beta, X, y, z, u, rho):
     u = np.array([np.zeros(p) for i in range(nchunks)])
     betas = np.array([np.ones(p) for i in range(nchunks)])

+    n_iter = 0
+
     for k in range(max_iter):

         # x-update step
@@ -283,10 +288,12 @@ def wrapped(beta, X, y, z, u, rho):
         eps_dual = np.sqrt(p * nchunks) * abstol + \
             reltol * np.linalg.norm(rho * u)

+        n_iter += 1
+
         if primal_res < eps_pri and dual_res < eps_dual:
             break

-    return z
+    return z, n_iter


 def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b):
@@ -349,7 +356,7 @@ def compute_loss_grad(beta, X, y):
         args=(X, y),
         iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)

-    return beta
+    return beta, info['nit']


 @normalize
@@ -387,6 +394,8 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     beta = np.zeros(p)
     regularizer = Regularizer.get(regularizer)

+    n_iter = 0
+
     for k in range(max_iter):
         # Compute the gradient
         if k % recalcRate == 0:
@@ -400,6 +409,8 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,

         obeta = beta

+        n_iter += 1
+
         # Compute the step size
         lf = func
         for ii in range(100):
@@ -426,9 +437,9 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,

     # L2-regularization returned a dask-array
     try:
-        return beta.compute()
+        return beta.compute(), n_iter
     except AttributeError:
-        return beta
+        return beta, n_iter


 _solvers = {
diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
index 435f7e6..4020c60 100644
--- a/dask_glm/estimators.py
+++ b/dask_glm/estimators.py
@@ -63,7 +63,7 @@ def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',

     def fit(self, X, y=None):
         X_ = self._maybe_add_intercept(X)
-        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)
+        self._coef, self.n_iter_ = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)

         if self.fit_intercept:
             self.coef_ = self._coef[:-1]
diff --git a/dask_glm/utils.py b/dask_glm/utils.py
index fd91460..2fbeac8 100644
--- a/dask_glm/utils.py
+++ b/dask_glm/utils.py
@@ -23,10 +23,11 @@ def normalize_inputs(X, y, *args, **kwargs):
             std[intercept_idx] = 1
             mean = mean if len(intercept_idx[0]) else np.zeros(mean.shape)
             Xn = (X - mean) / std
-            out = algo(Xn, y, *args, **kwargs).copy()
+            out, n_iter = algo(Xn, y, *args, **kwargs)
+            out = out.copy()
             i_adj = np.sum(out * mean / std)
             out[intercept_idx] -= i_adj
-            return out / std
+            return out / std, n_iter
         else:
             return algo(X, y, *args, **kwargs)
     return normalize_inputs

From e2d7e19785f30621107b8a2ce3cf69503da9102c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 20 Mar 2019 16:24:23 +0100
Subject: [PATCH 2/5] Fix tests, include tests to check number of iterations

---
 dask_glm/algorithms.py                | 10 +++++---
 dask_glm/tests/test_admm.py           |  4 +++-
 dask_glm/tests/test_algos_families.py | 34 +++++++++++++++++----------
 dask_glm/tests/test_utils.py          | 16 ++++++-------
 4 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index e2b58b6..efea118 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -87,6 +87,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     loglike, gradient = family.loglike, family.gradient
     n, p = X.shape
@@ -161,6 +162,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     gradient, hessian = family.gradient, family.hessian
     n, p = X.shape
@@ -184,15 +186,14 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):

         step, _, _, _ = np.linalg.lstsq(hess, grad)
         beta = (beta_old - step)

-        n_iter += 1
-
         # should change this criterion
         coef_change = np.absolute(beta_old - beta)
         converged = (
-            (not np.any(coef_change > tol)) or (n_iter > max_iter))
+            (not np.any(coef_change > tol)) or (n_iter >= max_iter))

         if not converged:
             Xbeta = dot(X, beta)  # numpy -> dask converstion of beta
+            n_iter += 1

     return beta, n_iter
@@ -220,6 +221,7 @@ def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     pointwise_loss = family.pointwise_loss
     pointwise_gradient = family.pointwise_gradient
@@ -333,6 +335,7 @@ def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     pointwise_loss = family.pointwise_loss
     pointwise_gradient = family.pointwise_gradient
@@ -382,6 +385,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     n, p = X.shape
     firstBacktrackMult = 0.1
diff --git a/dask_glm/tests/test_admm.py b/dask_glm/tests/test_admm.py
index 7b0373a..4f49b89 100644
--- a/dask_glm/tests/test_admm.py
+++ b/dask_glm/tests/test_admm.py
@@ -47,11 +47,13 @@ def wrapped(beta, X, y, z, u, rho):
 @pytest.mark.parametrize('nchunks', [5, 10])
 @pytest.mark.parametrize('p', [1, 5, 10])
 def test_admm_with_large_lamduh(N, p, nchunks):
+    max_iter = 500
     X = da.random.random((N, p), chunks=(N // nchunks, p))
     beta = np.random.random(p)
     y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

     X, y = persist(X, y)
-    z = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=500)
+    z, n_iter = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=max_iter)

     assert np.allclose(z, np.zeros(p), atol=1e-4)
+    assert n_iter > 0 and n_iter <= max_iter
diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py
index 3dedc38..363c2a4 100644
--- a/dask_glm/tests/test_algos_families.py
+++ b/dask_glm/tests/test_algos_families.py
@@ -48,7 +48,7 @@ def make_intercept_data(N, p, seed=20009):
                           (95, 6, 70605)])
 def test_methods(N, p, seed, opt):
     X, y = make_intercept_data(N, p, seed=seed)
-    coefs = opt(X, y)
+    coefs, _ = opt(X, y)
     p = sigmoid(X.dot(coefs).compute())

     y_sum = y.compute().sum()
@@ -57,9 +57,9 @@ def test_methods(N, p, seed, opt):


 @pytest.mark.parametrize('func,kwargs', [
-    (newton, {'tol': 1e-5}),
-    (lbfgs, {'tol': 1e-8}),
-    (gradient_descent, {'tol': 1e-7}),
+    (newton, {'tol': 1e-5, 'max_iter': 50}),
+    (lbfgs, {'tol': 1e-8, 'max_iter': 100}),
+    (gradient_descent, {'tol': 1e-7, 'max_iter': 100}),
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
@@ -72,18 +72,20 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family):

     X, y = persist(X, y)

-    result = func(X, y, family=family, **kwargs)
+    result, n_iter = func(X, y, family=family, **kwargs)
     test_vec = np.random.normal(size=2)

     opt = family.pointwise_loss(result, X, y).compute()
     test_val = family.pointwise_loss(test_vec, X, y).compute()

+    max_iter = kwargs['max_iter']
+    assert n_iter > 0 and n_iter <= max_iter
     assert opt < test_val


 @pytest.mark.parametrize('func,kwargs', [
-    (admm, {'abstol': 1e-4}),
-    (proximal_grad, {'tol': 1e-7}),
+    (admm, {'abstol': 1e-4, 'max_iter': 250}),
+    (proximal_grad, {'tol': 1e-7, 'max_iter': 100}),
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
@@ -98,7 +100,7 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):

     X, y = persist(X, y)

-    result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
+    result, n_iter = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
     test_vec = np.random.normal(size=2)

     f = reg.add_reg_f(family.pointwise_loss, lam)
@@ -106,6 +108,8 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):
     opt = f(result, X, y).compute()
     test_val = f(test_vec, X, y).compute()

+    max_iter = kwargs['max_iter']
+    assert n_iter > 0 and n_iter <= max_iter
     assert opt < test_val


@@ -124,9 +128,12 @@ def test_determinism(func, kwargs, scheduler):
     X, y = make_intercept_data(1000, 10)

     with dask.config.set(scheduler=scheduler):
-        a = func(X, y, **kwargs)
-        b = func(X, y, **kwargs)
+        a, n_iter_a = func(X, y, **kwargs)
+        b, n_iter_b = func(X, y, **kwargs)

+    max_iter = kwargs['max_iter']
+    assert n_iter_a > 0 and n_iter_a <= max_iter
+    assert n_iter_b > 0 and n_iter_b <= max_iter
     assert (a == b).all()


@@ -147,7 +154,10 @@ def test_determinism_distributed(func, kwargs, loop):
         with Client(s['address'], loop=loop) as c:
             X, y = make_intercept_data(1000, 10)

-            a = func(X, y, **kwargs)
-            b = func(X, y, **kwargs)
+            a, n_iter_a = func(X, y, **kwargs)
+            b, n_iter_b = func(X, y, **kwargs)

+            max_iter = kwargs['max_iter']
+            assert n_iter_a > 0 and n_iter_a <= max_iter
+            assert n_iter_b > 0 and n_iter_b <= max_iter
             assert (a == b).all()
diff --git a/dask_glm/tests/test_utils.py b/dask_glm/tests/test_utils.py
index 813b2fd..28b41ef 100644
--- a/dask_glm/tests/test_utils.py
+++ b/dask_glm/tests/test_utils.py
@@ -9,41 +9,41 @@
 def test_normalize_normalizes():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [1, 2, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y)
+    res, n_iter = do_nothing(X, y)
     np.testing.assert_equal(res, np.array([-3.0, 1.0, 2.0]))


 def test_normalize_doesnt_normalize():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [1, 2, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y, normalize=False)
+    res, n_iter = do_nothing(X, y, normalize=False)
     np.testing.assert_equal(res, np.array([0, 1, 2]))


 def test_normalize_normalizes_if_intercept_not_present():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [3, 9.0, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y)
+    res, n_iter = do_nothing(X, y)
     np.testing.assert_equal(res, np.array([0, 1 / 4.5, 2]))


 def test_normalize_raises_if_multiple_constants():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 2, 3], [1, 2, 3]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
     with pytest.raises(ValueError):
-        res = do_nothing(X, y)
+        res, n_iter = do_nothing(X, y)


 def test_add_intercept():

From 1664bee1e2adf5f3d21f9defdecd2b86bb20f49d Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Wed, 20 Mar 2019 18:59:06 +0100
Subject: [PATCH 3/5] Fix flake8 error on Python 2.7

---
 dask_glm/tests/test_algos_families.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py
index 363c2a4..71c8698 100644
--- a/dask_glm/tests/test_algos_families.py
+++ b/dask_glm/tests/test_algos_families.py
@@ -139,7 +139,7 @@ def test_determinism(func, kwargs, scheduler):

 try:
     from distributed import Client
-    from distributed.utils_test import cluster, loop  # flake8: noqa
+    from distributed.utils_test import cluster, loop  # noqa
 except ImportError:
     pass
 else:

From 34766b23bfec90a51e7afeaeddfce6d9f096957c Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Fri, 22 Mar 2019 22:15:31 +0100
Subject: [PATCH 4/5] Add missing n_iter_ estimators docstring

---
 dask_glm/estimators.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
index 4020c60..d07a009 100644
--- a/dask_glm/estimators.py
+++ b/dask_glm/estimators.py
@@ -106,6 +106,8 @@ class LogisticRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model
@@ -160,6 +162,8 @@ class LinearRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model
@@ -210,6 +214,8 @@ class PoissonRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model

From 83af95fd23c2308a7de4ed94c1e3eaa76be00cdc Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev
Date: Mon, 25 Mar 2019 21:38:12 +0100
Subject: [PATCH 5/5] Fix newton number of interations computation

---
 dask_glm/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index efea118..f5916d7 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -188,12 +188,12 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):

         # should change this criterion
         coef_change = np.absolute(beta_old - beta)
+        n_iter += 1
         converged = (
             (not np.any(coef_change > tol)) or (n_iter >= max_iter))

         if not converged:
             Xbeta = dot(X, beta)  # numpy -> dask converstion of beta
-            n_iter += 1

     return beta, n_iter
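
A minimal usage sketch of the n_iter_ attribute introduced by this series (illustrative only: the synthetic data, chunk sizes and the max_iter keyword value below are assumptions, not taken from the patches):

    import numpy as np
    import dask.array as da

    from dask_glm.estimators import LogisticRegression

    # Small synthetic problem built as dask arrays (shapes and chunks chosen arbitrarily).
    X = da.random.random((1000, 5), chunks=(100, 5))
    true_beta = np.random.random(5)
    y = (X.dot(true_beta) > true_beta.sum() / 2).astype(float)

    # After fit(), the estimator exposes n_iter_ (the number of solver iterations)
    # alongside coef_ and intercept_, as described in the docstrings added above.
    est = LogisticRegression(solver='admm', max_iter=250)
    est.fit(X, y)
    print(est.coef_, est.intercept_, est.n_iter_)
    assert 0 < est.n_iter_ <= 250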