From ca340177a567be997efc587aeaed277bccb2aa54 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 08:49:40 +0100 Subject: [PATCH 01/14] Add comment to stat_mat.standardize() --- src/tabmat/standardized_mat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index e53d8100..930279c3 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -130,7 +130,15 @@ def sandwich( if not hasattr(d, "dtype"): d = np.asarray(d) check_sandwich_compatible(self, d) - + # stat_mat = mat * mult[newaxis, :] + shift[newaxis, :] + # stat_mat.T @ d[:, newaxis] * stat_mat + # = mult[:, newaxis] * mat.T @ d[:, newaxis] * mat * mult[newaxis, :] + (1) + # mult[:, newaxis] * mat.T @ d[:, newaxis] * np.outer(ones, shift) + (2) + # shift[:, newaxis] @ d[:, newaxis] * mat * mult[newaxis, :] + (3) + # shift[:, newaxis] @ d[:, newaxis] * shift[newaxis, :] (4) + # + # (1) = self.mat.sandwich(d) * np.outer(limited_mult, limited_mult) + # (2) = mult * self.transpose_matvec(d) * shift[newaxis, :] if rows is not None or cols is not None: setup_rows, setup_cols = setup_restrictions(self.shape, rows, cols) if rows is not None: From 65ab55d433db6c56bf497de1d9c85ff997fdd9fc Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 08:51:01 +0100 Subject: [PATCH 02/14] More conservative approach to compute standard deviations. --- src/tabmat/dense_matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index c6738c80..b626d2a8 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -164,8 +164,8 @@ def _cross_sandwich( raise TypeError def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: - """Get standard deviations of columns.""" - sqrt_arg = transpose_square_dot_weights(self._array, weights) - col_means**2 + """Get standard deviations of columns using weights `weights`.""" + sqrt_arg = transpose_square_dot_weights(self._array - col_means[np.newaxis, :], weights) # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. From ed0da75f0a71d85b1db14a427352cf530a5e8570 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 08:52:25 +0100 Subject: [PATCH 03/14] Format --- src/tabmat/dense_matrix.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index b626d2a8..ae19d74e 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -165,7 +165,9 @@ def _cross_sandwich( def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: """Get standard deviations of columns using weights `weights`.""" - sqrt_arg = transpose_square_dot_weights(self._array - col_means[np.newaxis, :], weights) + sqrt_arg = transpose_square_dot_weights( + self._array - col_means[np.newaxis, :], weights + ) # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. From 593090874e6de1703b2c108e80bd0042605ce18a Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 08:58:06 +0100 Subject: [PATCH 04/14] Update changelog. --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 59cc9c51..5dbbf8db 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog Unreleased ---------- +**Bug fix:** + +- A more robust :meth:`DenseMatrix._get_col_stds` results in more accurate :meth:`StandardizedMatrix.sandwich` results. + **Other changes:** - Build wheel for pypi on python 3.13. From f9fa1a5e083041c08072ab89f36701db87bf0427 Mon Sep 17 00:00:00 2001 From: Jan Tilly Date: Tue, 19 Nov 2024 10:47:43 +0100 Subject: [PATCH 05/14] Add example to CI. --- .github/workflows/ci.yml | 2 ++ example.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 example.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0492986..4481f01c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,6 +48,8 @@ jobs: run: pixi run -e ${{ matrix.environment }} install-nightlies - name: Install repository run: pixi run -e ${{ matrix.environment }} postinstall + - name: Run Malte's example + run: pixi run -e ${{ matrix.environment }} python example.py - name: Run pytest run: pixi run -e ${{ matrix.environment }} test -nauto -m "not high_memory" - name: Run doctest diff --git a/example.py b/example.py new file mode 100644 index 00000000..4f3e5e73 --- /dev/null +++ b/example.py @@ -0,0 +1,37 @@ +import numpy as np + +import tabmat + +np.set_printoptions(suppress=True) + +for dtype in [np.float32, np.float64]: + X = np.array( + [ + [46.231056, 126.05263, 144.46439], + [46.231224, 128.66818, 0.7667693], + [46.231186, 104.97506, 193.8872], + [46.230835, 130.10156, 143.88954], + [46.230896, 116.76007, 7.5629334], + ], + dtype=dtype, + ) + v = np.array( + [0.12428328, 0.67062443, 0.6471895, 0.6153851, 0.38367754], dtype=dtype + ) + + weights = np.full(X.shape[0], 1 / X.shape[0], dtype=dtype) + + stmat, out_means, col_stds = tabmat.DenseMatrix(X).standardize(weights, True, True) + + print(stmat.toarray().T @ v) + print(stmat.transpose_matvec(v)) + + # compute by hand + res = np.zeros(X.shape[1], dtype=dtype) + for col in range(X.shape[1]): + res[col] += ( + stmat.shift[col] + stmat.mult[col] * stmat.mat.toarray()[:, col] + ) @ v + + print(res) + print("\n") From 6d0a3acb1b5e5cd4d635315582de1d23fc01268a Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 09:23:04 +0100 Subject: [PATCH 06/14] try something --- src/tabmat/dense_matrix.py | 4 +--- src/tabmat/ext/dense.pyx | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index ae19d74e..3502251f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -165,9 +165,7 @@ def _cross_sandwich( def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: """Get standard deviations of columns using weights `weights`.""" - sqrt_arg = transpose_square_dot_weights( - self._array - col_means[np.newaxis, :], weights - ) + sqrt_arg = transpose_square_dot_weights(self._array, weights, col_means) # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. diff --git a/src/tabmat/ext/dense.pyx b/src/tabmat/ext/dense.pyx index 32a3ac49..0538df5b 100644 --- a/src/tabmat/ext/dense.pyx +++ b/src/tabmat/ext/dense.pyx @@ -100,7 +100,7 @@ def dense_matvec(np.ndarray X, floating[:] v, int[:] rows, int[:] cols): raise Exception("The matrix X is not contiguous.") return out -def transpose_square_dot_weights(np.ndarray X, floating[:] weights): +def transpose_square_dot_weights(np.ndarray X, floating[:] weights, floating[:] shift): cdef floating* Xp = X.data cdef int nrows = weights.shape[0] cdef int ncols = X.shape[1] @@ -112,11 +112,11 @@ def transpose_square_dot_weights(np.ndarray X, floating[:] weights): if X.flags["C_CONTIGUOUS"]: for j in prange(ncols, nogil=True): for i in range(nrows): - outp[j] = outp[j] + weights[i] * (Xp[i * ncols + j] ** 2) + outp[j] = outp[j] + weights[i] * ((Xp[i * ncols + j] - shift[j]) ** 2) elif X.flags["F_CONTIGUOUS"]: for j in prange(ncols, nogil=True): for i in range(nrows): - outp[j] = outp[j] + weights[i] * (Xp[j * nrows + i] ** 2) + outp[j] = outp[j] + weights[i] * ((Xp[j * nrows + i] - shift[j]) ** 2) else: raise Exception("The matrix X is not contiguous.") return out From e26d59f4fcdce6dc9fe7c2dde933f4ad4d3b69ab Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 24 Jan 2025 11:10:02 +0100 Subject: [PATCH 07/14] Run ci on pushes on prs --- .github/workflows/ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4481f01c..3cfef9e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,10 @@ name: CI -on: [push] + +on: + pull_request: + branches: + - main + push: jobs: pre-commit-checks: From 124c900b35febde7b234b4c82ca773a42b06970f Mon Sep 17 00:00:00 2001 From: Malte Londschien <61679398+mlondschien@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:45:27 +0100 Subject: [PATCH 08/14] Update .github/workflows/ci.yml Co-authored-by: Martin Stancsics --- .github/workflows/ci.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3cfef9e8..7291eee1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,10 +1,16 @@ name: CI on: + # We would like to trigger for CI for any pull request action - + # both from QuantCo's branches as well as forks. pull_request: - branches: - - main + # In addition to pull requests, we want to run CI for pushes + # to the main branch and tags. push: + branches: + - "main" + tags: + - "*" jobs: pre-commit-checks: From c02fbc2063e2be25ba76e8a94633739fc31908cb Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 09:43:23 +0100 Subject: [PATCH 09/14] Add test. --- tests/test_matrices.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index a72985e5..dd8061f6 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -813,3 +813,36 @@ def test_combine_names(mat_1, mat_2): assert combined.column_names == mat_1.column_names + mat_2.column_names assert combined.term_names == mat_1.term_names + mat_2.term_names + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_standardize_constant_cols(dtype): + # https://github.com/Quantco/tabmat/issues/414 + X = np.array( + [ + [46.231056, 126.05263, 144.46439], + [46.231224, 128.66818, 0.7667693], + [46.231186, 104.97506, 193.8872], + [46.230835, 130.10156, 143.88954], + [46.230896, 116.76007, 7.5629334], + ], + dtype=dtype, + ) + v = np.array( + [0.12428328, 0.67062443, 0.6471895, 0.6153851, 0.38367754], dtype=dtype + ) + weights = np.full(X.shape[0], 1 / X.shape[0], dtype=dtype) + + standardized_mat, _, _ = tm.DenseMatrix(X).standardize( + weights, center_predictors=True, scale_predictors=True + ) + + result = standardized_mat.transpose_matvec(v) + expected = standardized_mat.toarray().T @ v + + np.testing.assert_allclose(result, expected) + + result = standardized_mat.sandwich(v) + expected = (standardized_mat.toarray().T * v[None, :]) @ standardized_mat.toarray() + + np.testing.assert_allclose(result, expected) From 3fcc01a05ac63962861025284bd7ab26f7a2909b Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 09:52:46 +0100 Subject: [PATCH 10/14] Test the actual problem. --- tests/test_matrices.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index dd8061f6..02a93040 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -816,7 +816,7 @@ def test_combine_names(mat_1, mat_2): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_standardize_constant_cols(dtype): +def test_dense_matrix_get_col_stds(dtype): # https://github.com/Quantco/tabmat/issues/414 X = np.array( [ @@ -828,21 +828,11 @@ def test_standardize_constant_cols(dtype): ], dtype=dtype, ) - v = np.array( - [0.12428328, 0.67062443, 0.6471895, 0.6153851, 0.38367754], dtype=dtype - ) weights = np.full(X.shape[0], 1 / X.shape[0], dtype=dtype) - standardized_mat, _, _ = tm.DenseMatrix(X).standardize( + standardized_mat, _, col_stds = tm.DenseMatrix(X).standardize( weights, center_predictors=True, scale_predictors=True ) - result = standardized_mat.transpose_matvec(v) - expected = standardized_mat.toarray().T @ v - - np.testing.assert_allclose(result, expected) - - result = standardized_mat.sandwich(v) - expected = (standardized_mat.toarray().T * v[None, :]) @ standardized_mat.toarray() - - np.testing.assert_allclose(result, expected) + np.testing.assert_allclose(col_stds, np.std(X, axis=0, ddof=0)) + np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0)) \ No newline at end of file From 812f01f6085655cb8c5030eb6bdd8b6be6c55c98 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 09:59:11 +0100 Subject: [PATCH 11/14] Format --- tests/test_matrices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 02a93040..ec33af78 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -835,4 +835,4 @@ def test_dense_matrix_get_col_stds(dtype): ) np.testing.assert_allclose(col_stds, np.std(X, axis=0, ddof=0)) - np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0)) \ No newline at end of file + np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0)) From 245192b167edd060fb14a97f8bcb086d8438a45d Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 10:17:20 +0100 Subject: [PATCH 12/14] Update precision to sqrt(eps) --- tests/test_matrices.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index ec33af78..729fbe1c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -834,5 +834,6 @@ def test_dense_matrix_get_col_stds(dtype): weights, center_predictors=True, scale_predictors=True ) - np.testing.assert_allclose(col_stds, np.std(X, axis=0, ddof=0)) - np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0)) + eps = np.sqrt(np.finfo(dtype).eps) # sqrt since std = sqrt(var) + np.testing.assert_allclose(col_stds, np.std(X, axis=0, ddof=0), rtol=eps) + np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0), rtol=eps) From 080a475ce3f18b58301afacbc91595337da00024 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 10:17:39 +0100 Subject: [PATCH 13/14] Format --- tests/test_matrices.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 729fbe1c..a1c711a4 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -836,4 +836,6 @@ def test_dense_matrix_get_col_stds(dtype): eps = np.sqrt(np.finfo(dtype).eps) # sqrt since std = sqrt(var) np.testing.assert_allclose(col_stds, np.std(X, axis=0, ddof=0), rtol=eps) - np.testing.assert_allclose(standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0), rtol=eps) + np.testing.assert_allclose( + standardized_mat.mult, 1 / np.std(X, axis=0, ddof=0), rtol=eps + ) From 1cda73815b903f165c5d5707c0e13cbd924a8da2 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Wed, 29 Jan 2025 10:24:30 +0100 Subject: [PATCH 14/14] Remove jan's example file. --- .github/workflows/ci.yml | 2 -- example.py | 37 ------------------------------------- 2 files changed, 39 deletions(-) delete mode 100644 example.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7291eee1..5859f6e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,8 +59,6 @@ jobs: run: pixi run -e ${{ matrix.environment }} install-nightlies - name: Install repository run: pixi run -e ${{ matrix.environment }} postinstall - - name: Run Malte's example - run: pixi run -e ${{ matrix.environment }} python example.py - name: Run pytest run: pixi run -e ${{ matrix.environment }} test -nauto -m "not high_memory" - name: Run doctest diff --git a/example.py b/example.py deleted file mode 100644 index 4f3e5e73..00000000 --- a/example.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np - -import tabmat - -np.set_printoptions(suppress=True) - -for dtype in [np.float32, np.float64]: - X = np.array( - [ - [46.231056, 126.05263, 144.46439], - [46.231224, 128.66818, 0.7667693], - [46.231186, 104.97506, 193.8872], - [46.230835, 130.10156, 143.88954], - [46.230896, 116.76007, 7.5629334], - ], - dtype=dtype, - ) - v = np.array( - [0.12428328, 0.67062443, 0.6471895, 0.6153851, 0.38367754], dtype=dtype - ) - - weights = np.full(X.shape[0], 1 / X.shape[0], dtype=dtype) - - stmat, out_means, col_stds = tabmat.DenseMatrix(X).standardize(weights, True, True) - - print(stmat.toarray().T @ v) - print(stmat.transpose_matvec(v)) - - # compute by hand - res = np.zeros(X.shape[1], dtype=dtype) - for col in range(X.shape[1]): - res[col] += ( - stmat.shift[col] + stmat.mult[col] * stmat.mat.toarray()[:, col] - ) @ v - - print(res) - print("\n")