diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 63d7729..f5916d7 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -87,6 +87,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     loglike, gradient = family.loglike, family.gradient
     n, p = X.shape
@@ -98,6 +99,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     recalcRate = 10
     backtrackMult = firstBacktrackMult
     beta = np.zeros(p)
+    n_iter = 0
 
     for k in range(max_iter):
         # how necessary is this recalculation?
@@ -125,6 +127,8 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         beta = beta - stepSize * grad  # tiny bit of repeat work here to avoid communication
         Xbeta = Xbeta - stepSize * Xgradient
 
+        n_iter += 1
+
         if stepSize == 0:
             break
 
@@ -136,7 +140,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         stepSize *= stepGrowth
         backtrackMult = nextBacktrackMult
 
-    return beta
+    return beta, n_iter
 
 
 @normalize
@@ -158,13 +162,14 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     gradient, hessian = family.gradient, family.hessian
     n, p = X.shape
     beta = np.zeros(p)  # always init to zeros?
     Xbeta = dot(X, beta)
 
-    iter_count = 0
+    n_iter = 0
     converged = False
 
     while not converged:
@@ -181,17 +186,16 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
         step, _, _, _ = np.linalg.lstsq(hess, grad)
         beta = (beta_old - step)
 
-        iter_count += 1
-
         # should change this criterion
         coef_change = np.absolute(beta_old - beta)
+        n_iter += 1
         converged = (
-            (not np.any(coef_change > tol)) or (iter_count > max_iter))
+            (not np.any(coef_change > tol)) or (n_iter >= max_iter))
 
         if not converged:
             Xbeta = dot(X, beta)  # numpy -> dask converstion of beta
 
-    return beta
+    return beta, n_iter
 
 
 @normalize
@@ -217,6 +221,7 @@ def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     pointwise_loss = family.pointwise_loss
     pointwise_gradient = family.pointwise_gradient
@@ -256,6 +261,8 @@ def wrapped(beta, X, y, z, u, rho):
     u = np.array([np.zeros(p) for i in range(nchunks)])
     betas = np.array([np.ones(p) for i in range(nchunks)])
 
+    n_iter = 0
+
     for k in range(max_iter):
 
         # x-update step
@@ -283,10 +290,12 @@ def wrapped(beta, X, y, z, u, rho):
         eps_dual = np.sqrt(p * nchunks) * abstol + \
             reltol * np.linalg.norm(rho * u)
 
+        n_iter += 1
+
         if primal_res < eps_pri and dual_res < eps_dual:
             break
 
-    return z
+    return z, n_iter
 
 
 def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b):
@@ -326,6 +335,7 @@ def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     pointwise_loss = family.pointwise_loss
     pointwise_gradient = family.pointwise_gradient
@@ -349,7 +359,7 @@ def compute_loss_grad(beta, X, y):
         args=(X, y),
         iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)
 
-    return beta
+    return beta, info['nit']
 
 
 @normalize
@@ -375,6 +385,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     Returns
     -------
     beta : array-like, shape (n_features,)
+    n_iter : number of iterations executed
     """
     n, p = X.shape
     firstBacktrackMult = 0.1
@@ -387,6 +398,8 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
     beta = np.zeros(p)
     regularizer = Regularizer.get(regularizer)
 
+    n_iter = 0
+
     for k in range(max_iter):
         # Compute the gradient
         if k % recalcRate == 0:
@@ -400,6 +413,8 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
 
         obeta = beta
 
+        n_iter += 1
+
         # Compute the step size
         lf = func
         for ii in range(100):
@@ -426,9 +441,9 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
 
     # L2-regularization returned a dask-array
     try:
-        return beta.compute()
+        return beta.compute(), n_iter
     except AttributeError:
-        return beta
+        return beta, n_iter
 
 
 _solvers = {
diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
index 435f7e6..d07a009 100644
--- a/dask_glm/estimators.py
+++ b/dask_glm/estimators.py
@@ -63,7 +63,7 @@ def __init__(self, fit_intercept=True, solver='admm', regularizer='l2',
 
     def fit(self, X, y=None):
         X_ = self._maybe_add_intercept(X)
-        self._coef = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)
+        self._coef, self.n_iter_ = algorithms._solvers[self.solver](X_, y, **self._fit_kwargs)
 
         if self.fit_intercept:
             self.coef_ = self._coef[:-1]
@@ -106,6 +106,8 @@ class LogisticRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model
@@ -160,6 +162,8 @@ class LinearRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model
@@ -210,6 +214,8 @@ class PoissonRegression(_GLM):
     ----------
     coef_ : array, shape (n_classes, n_features)
         The learned value for the model's coefficients
+    n_iter_ : integer
+        The number of iterations executed
     intercept_ : float of None
         The learned value for the intercept, if one was added
         to the model
diff --git a/dask_glm/tests/test_admm.py b/dask_glm/tests/test_admm.py
index 7b0373a..4f49b89 100644
--- a/dask_glm/tests/test_admm.py
+++ b/dask_glm/tests/test_admm.py
@@ -47,11 +47,13 @@ def wrapped(beta, X, y, z, u, rho):
 @pytest.mark.parametrize('nchunks', [5, 10])
 @pytest.mark.parametrize('p', [1, 5, 10])
 def test_admm_with_large_lamduh(N, p, nchunks):
+    max_iter = 500
     X = da.random.random((N, p), chunks=(N // nchunks, p))
     beta = np.random.random(p)
     y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))
 
     X, y = persist(X, y)
-    z = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=500)
+    z, n_iter = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=max_iter)
 
     assert np.allclose(z, np.zeros(p), atol=1e-4)
+    assert n_iter > 0 and n_iter <= max_iter
diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py
index 3dedc38..71c8698 100644
--- a/dask_glm/tests/test_algos_families.py
+++ b/dask_glm/tests/test_algos_families.py
@@ -48,7 +48,7 @@ def make_intercept_data(N, p, seed=20009):
                           (95, 6, 70605)])
 def test_methods(N, p, seed, opt):
     X, y = make_intercept_data(N, p, seed=seed)
-    coefs = opt(X, y)
+    coefs, _ = opt(X, y)
     p = sigmoid(X.dot(coefs).compute())
 
     y_sum = y.compute().sum()
@@ -57,9 +57,9 @@
 
 
 @pytest.mark.parametrize('func,kwargs', [
-    (newton, {'tol': 1e-5}),
-    (lbfgs, {'tol': 1e-8}),
-    (gradient_descent, {'tol': 1e-7}),
+    (newton, {'tol': 1e-5, 'max_iter': 50}),
+    (lbfgs, {'tol': 1e-8, 'max_iter': 100}),
+    (gradient_descent, {'tol': 1e-7, 'max_iter': 100}),
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
@@ -72,18 +72,20 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family):
 
     X, y = persist(X, y)
 
-    result = func(X, y, family=family, **kwargs)
+    result, n_iter = func(X, y, family=family, **kwargs)
     test_vec = np.random.normal(size=2)
 
     opt = family.pointwise_loss(result, X, y).compute()
     test_val = family.pointwise_loss(test_vec, X, y).compute()
 
+    max_iter = kwargs['max_iter']
+    assert n_iter > 0 and n_iter <= max_iter
     assert opt < test_val
 
 
 @pytest.mark.parametrize('func,kwargs', [
-    (admm, {'abstol': 1e-4}),
-    (proximal_grad, {'tol': 1e-7}),
+    (admm, {'abstol': 1e-4, 'max_iter': 250}),
+    (proximal_grad, {'tol': 1e-7, 'max_iter': 100}),
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
@@ -98,7 +100,7 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):
 
     X, y = persist(X, y)
 
-    result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
+    result, n_iter = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
     test_vec = np.random.normal(size=2)
 
     f = reg.add_reg_f(family.pointwise_loss, lam)
@@ -106,6 +108,8 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):
     opt = f(result, X, y).compute()
     test_val = f(test_vec, X, y).compute()
 
+    max_iter = kwargs['max_iter']
+    assert n_iter > 0 and n_iter <= max_iter
     assert opt < test_val
 
 
@@ -124,15 +128,18 @@ def test_determinism(func, kwargs, scheduler):
     X, y = make_intercept_data(1000, 10)
 
     with dask.config.set(scheduler=scheduler):
-        a = func(X, y, **kwargs)
-        b = func(X, y, **kwargs)
+        a, n_iter_a = func(X, y, **kwargs)
+        b, n_iter_b = func(X, y, **kwargs)
 
+    max_iter = kwargs['max_iter']
+    assert n_iter_a > 0 and n_iter_a <= max_iter
+    assert n_iter_b > 0 and n_iter_b <= max_iter
     assert (a == b).all()
 
 
 try:
     from distributed import Client
-    from distributed.utils_test import cluster, loop  # flake8: noqa
+    from distributed.utils_test import cluster, loop  # noqa
 except ImportError:
     pass
 else:
@@ -147,7 +154,10 @@ def test_determinism_distributed(func, kwargs, loop):
         with Client(s['address'], loop=loop) as c:
             X, y = make_intercept_data(1000, 10)
 
-            a = func(X, y, **kwargs)
-            b = func(X, y, **kwargs)
+            a, n_iter_a = func(X, y, **kwargs)
+            b, n_iter_b = func(X, y, **kwargs)
 
+            max_iter = kwargs['max_iter']
+            assert n_iter_a > 0 and n_iter_a <= max_iter
+            assert n_iter_b > 0 and n_iter_b <= max_iter
             assert (a == b).all()
diff --git a/dask_glm/tests/test_utils.py b/dask_glm/tests/test_utils.py
index 813b2fd..28b41ef 100644
--- a/dask_glm/tests/test_utils.py
+++ b/dask_glm/tests/test_utils.py
@@ -9,41 +9,41 @@
 def test_normalize_normalizes():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [1, 2, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y)
+    res, n_iter = do_nothing(X, y)
     np.testing.assert_equal(res, np.array([-3.0, 1.0, 2.0]))
 
 
 def test_normalize_doesnt_normalize():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [1, 2, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y, normalize=False)
+    res, n_iter = do_nothing(X, y, normalize=False)
     np.testing.assert_equal(res, np.array([0, 1, 2]))
 
 
 def test_normalize_normalizes_if_intercept_not_present():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 0, 0], [3, 9.0, 2]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
-    res = do_nothing(X, y)
+    res, n_iter = do_nothing(X, y)
     np.testing.assert_equal(res, np.array([0, 1 / 4.5, 2]))
 
 
 def test_normalize_raises_if_multiple_constants():
     @utils.normalize
     def do_nothing(X, y):
-        return np.array([0.0, 1.0, 2.0])
+        return np.array([0.0, 1.0, 2.0]), None
     X = da.from_array(np.array([[1, 2, 3], [1, 2, 3]]), chunks=(2, 3))
     y = da.from_array(np.array([0, 1, 0]), chunks=(3, ))
     with pytest.raises(ValueError):
-        res = do_nothing(X, y)
+        res, n_iter = do_nothing(X, y)
 
 
 def test_add_intercept():
diff --git a/dask_glm/utils.py b/dask_glm/utils.py
index fd91460..2fbeac8 100644
--- a/dask_glm/utils.py
+++ b/dask_glm/utils.py
@@ -23,10 +23,11 @@ def normalize_inputs(X, y, *args, **kwargs):
             std[intercept_idx] = 1
             mean = mean if len(intercept_idx[0]) else np.zeros(mean.shape)
             Xn = (X - mean) / std
-            out = algo(Xn, y, *args, **kwargs).copy()
+            out, n_iter = algo(Xn, y, *args, **kwargs)
+            out = out.copy()
             i_adj = np.sum(out * mean / std)
             out[intercept_idx] -= i_adj
-            return out / std
+            return out / std, n_iter
         else:
             return algo(X, y, *args, **kwargs)
     return normalize_inputs
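
With this patch, every solver in dask_glm.algorithms returns a (beta, n_iter) tuple, and the estimators store the iteration count on the fitted model as n_iter_. A minimal usage sketch of the new API follows; the toy data below is an illustrative assumption, not taken from the patch or the test suite:

    import dask.array as da

    from dask_glm.algorithms import gradient_descent
    from dask_glm.estimators import LogisticRegression

    # Illustrative toy problem: 1000 samples, 5 features, random 0/1 labels.
    X = da.random.random((1000, 5), chunks=(100, 5))
    y = (da.random.random(1000, chunks=(100,)) > 0.5).astype('float64')

    # Solver-level API: the second return value is the iteration count.
    beta, n_iter = gradient_descent(X, y, max_iter=100)
    assert 0 < n_iter <= 100

    # Estimator-level API: fit() now also sets n_iter_ on the estimator.
    lr = LogisticRegression(max_iter=100)
    lr.fit(X, y)
    print(lr.coef_, lr.n_iter_)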