|
1 | 1 | import sys
|
2 |
| -from pathlib import Path |
3 | 2 |
|
4 | 3 | import geopandas as gpd
|
5 | 4 | import numpy as np
|
6 | 5 | import pandas as pd
|
7 | 6 | import pytest
|
| 7 | +from beartype.typing import Optional |
8 | 8 | from shapely.geometry import Point
|
9 | 9 |
|
10 | 10 | from eis_toolkit.exceptions import EmptyDataException, InvalidColumnException, InvalidParameterValueException
|
11 | 11 | from eis_toolkit.exploratory_analyses.pca import compute_pca
|
12 | 12 |
|
13 |
| -parent_dir = Path(__file__).parent |
14 |
| -MULTIBAND_RASTER_PATH = parent_dir.joinpath("../data/remote/small_raster_multiband.tif") |
15 |
| - |
16 | 13 | DATA = np.array([[1, 1], [2, 2], [3, 3]])
|
| 14 | +EXPECTED_DATA_PCA_VALUES = expected_pca_values = np.array( |
| 15 | + [[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]] |
| 16 | +) |
| 17 | +EXPECTED_DATA_COMPONENT_VALUES = np.array([[0.70711, 0.70711], [0.70711, -0.70711]]) |
| 18 | +EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE = np.array([[0.70711, 0.70711], [-0.70711, 0.70711]]) |
| 19 | +EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES = [1.0, 4.10865055e-33] |
| 20 | + |
| 21 | +DATA_DF = pd.DataFrame(data=DATA, columns=["A", "B"]) |
| 22 | +EXPECTED_DATA_DF_COLUMNS = ["principal_component_1", "principal_component_2"] |
| 23 | + |
| 24 | +DATA_GDF = gpd.GeoDataFrame( |
| 25 | + data=DATA, columns=["A", "B"], geometry=[Point(1, 2), Point(2, 1), Point(3, 3)], crs="EPSG:4326" |
| 26 | +) |
| 27 | +EXPECTED_DATA_GDF_COLUMNS = ["principal_component_1", "principal_component_2", "geometry"] |
| 28 | + |
| 29 | +DATA_WITH_NAN = np.array([[1, 1], [2, np.nan], [3, 3]]) |
| 30 | +DATA_WITH_NODATA = np.array([[1, 1], [2, -9999], [3, 3]]) |
| 31 | + |
| 32 | + |
| 33 | +def _assert_expected_values( |
| 34 | + pca_array: np.ndarray, |
| 35 | + principal_components, |
| 36 | + explained_variances, |
| 37 | + explained_variance_ratios, |
| 38 | + expected_pca_values=EXPECTED_DATA_PCA_VALUES, |
| 39 | + expected_component_values=EXPECTED_DATA_COMPONENT_VALUES, |
| 40 | + expected_component_values_alternative: Optional[np.ndarray] = None, |
| 41 | + expected_explained_variance_ratios_values=EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES, |
| 42 | + decimal_accuracy: int = 5, |
| 43 | + data_shape=DATA.shape, |
| 44 | +): |
| 45 | + np.testing.assert_equal(principal_components.size, 4) |
| 46 | + np.testing.assert_equal(explained_variances.size, 2) |
| 47 | + np.testing.assert_equal(explained_variance_ratios.size, 2) |
| 48 | + np.testing.assert_equal(pca_array.shape, data_shape) |
| 49 | + np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=decimal_accuracy) |
| 50 | + |
| 51 | + try: |
| 52 | + np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=decimal_accuracy) |
| 53 | + except AssertionError: |
| 54 | + # Either sign variation of the last two members of principal_components can occur, |
| 55 | + # depending on the environment, in the NaN and nodata tests. |
| 56 | + # Both variations are accepted for those tests. |
| 57 | + if expected_component_values_alternative is None: |
| 58 | + # Sign deviations are not expected unless an *_alternative array is passed as input |
| 59 | + raise |
| 60 | + np.testing.assert_array_almost_equal( |
| 61 | + principal_components, expected_component_values_alternative, decimal=decimal_accuracy |
| 62 | + ) |
| 63 | + |
| 64 | + np.testing.assert_array_almost_equal( |
| 65 | + explained_variance_ratios, expected_explained_variance_ratios_values, decimal=decimal_accuracy |
| 66 | + ) |
17 | 67 |
|
18 | 68 |
|
19 | 69 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
20 | 70 | def test_pca_numpy_array():
|
21 | 71 | """Test that PCA function gives correct output for Numpy array input."""
|
22 | 72 | pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA, 2)
|
23 | 73 |
|
24 |
| - expected_pca_array_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]]) |
25 |
| - expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]]) |
26 |
| - expected_explained_variance_ratios_values = [1.0, 4.10865055e-33] |
27 |
| - |
28 |
| - np.testing.assert_equal(principal_components.size, 4) |
29 |
| - np.testing.assert_equal(explained_variances.size, 2) |
30 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
31 |
| - np.testing.assert_equal(pca_array.shape, DATA.shape) |
32 |
| - |
33 |
| - np.testing.assert_array_almost_equal(pca_array, expected_pca_array_values, decimal=5) |
34 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5) |
35 |
| - np.testing.assert_array_almost_equal( |
36 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5 |
| 74 | + _assert_expected_values( |
| 75 | + pca_array=pca_array, |
| 76 | + principal_components=principal_components, |
| 77 | + explained_variances=explained_variances, |
| 78 | + explained_variance_ratios=explained_variance_ratios, |
37 | 79 | )
|
38 | 80 |
|
39 | 81 |
|
40 | 82 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
41 | 83 | def test_pca_df():
|
42 | 84 | """Test that PCA function gives correct output for DF input."""
|
43 |
| - data_df = pd.DataFrame(data=DATA, columns=["A", "B"]) |
44 | 85 |
|
45 |
| - pca_df, principal_components, explained_variances, explained_variance_ratios = compute_pca(data_df, 2) |
| 86 | + pca_df, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA_DF, 2) |
46 | 87 |
|
47 |
| - expected_columns = ["principal_component_1", "principal_component_2"] |
48 |
| - expected_pca_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]]) |
49 |
| - expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]]) |
50 |
| - expected_explained_variance_ratios_values = [1.0, 4.10865055e-33] |
51 |
| - |
52 |
| - np.testing.assert_equal(principal_components.size, 4) |
53 |
| - np.testing.assert_equal(explained_variances.size, 2) |
54 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
55 |
| - np.testing.assert_equal(list(pca_df.columns), expected_columns) |
56 |
| - np.testing.assert_equal(pca_df.shape, data_df.shape) |
57 |
| - |
58 |
| - np.testing.assert_array_almost_equal(pca_df.values, expected_pca_values, decimal=5) |
59 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5) |
60 |
| - np.testing.assert_array_almost_equal( |
61 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5 |
| 88 | + _assert_expected_values( |
| 89 | + pca_array=pca_df.values, |
| 90 | + principal_components=principal_components, |
| 91 | + explained_variances=explained_variances, |
| 92 | + explained_variance_ratios=explained_variance_ratios, |
62 | 93 | )
|
| 94 | + np.testing.assert_equal(list(pca_df.columns), EXPECTED_DATA_DF_COLUMNS) |
| 95 | + np.testing.assert_equal(pca_df.shape, DATA_DF.shape) |
63 | 96 |
|
64 | 97 |
|
65 | 98 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
66 | 99 | def test_pca_gdf():
|
67 | 100 | """Test that PCA function gives correct output for GDF input."""
|
68 |
| - data_gdf = gpd.GeoDataFrame( |
69 |
| - data=DATA, columns=["A", "B"], geometry=[Point(1, 2), Point(2, 1), Point(3, 3)], crs="EPSG:4326" |
70 |
| - ) |
71 |
| - |
72 |
| - pca_gdf, principal_components, explained_variances, explained_variance_ratios = compute_pca(data_gdf, 2) |
73 | 101 |
|
74 |
| - expected_columns = ["principal_component_1", "principal_component_2", "geometry"] |
75 |
| - expected_pca_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]]) |
76 |
| - expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]]) |
77 |
| - expected_explained_variance_ratios_values = [1.0, 4.10865055e-33] |
| 102 | + pca_gdf, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA_GDF, 2) |
78 | 103 |
|
79 |
| - np.testing.assert_equal(principal_components.size, 4) |
80 |
| - np.testing.assert_equal(explained_variances.size, 2) |
81 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
82 |
| - np.testing.assert_equal(list(pca_gdf.columns), expected_columns) |
83 |
| - np.testing.assert_equal(pca_gdf.shape, data_gdf.shape) |
84 |
| - |
85 |
| - np.testing.assert_array_almost_equal(pca_gdf.drop(columns=["geometry"]).values, expected_pca_values, decimal=5) |
86 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5) |
87 |
| - np.testing.assert_array_almost_equal( |
88 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5 |
| 104 | + _assert_expected_values( |
| 105 | + pca_array=pca_gdf.drop(columns=["geometry"]).values, |
| 106 | + principal_components=principal_components, |
| 107 | + explained_variances=explained_variances, |
| 108 | + explained_variance_ratios=explained_variance_ratios, |
89 | 109 | )
|
90 | 110 |
|
| 111 | + np.testing.assert_equal(list(pca_gdf.columns), EXPECTED_DATA_GDF_COLUMNS) |
| 112 | + np.testing.assert_equal(pca_gdf.shape, DATA_GDF.shape) |
| 113 | + |
91 | 114 |
|
92 | 115 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
93 | 116 | def test_pca_with_nan_removal():
|
94 | 117 | """Test that PCA function gives correct output for Numpy array input that has NaN values and remove strategy."""
|
95 |
| - data = np.array([[1, 1], [2, np.nan], [3, 3]]) |
96 | 118 | pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
|
97 |
| - data, 2, nodata_handling="remove" |
| 119 | + DATA_WITH_NAN, 2, nodata_handling="remove" |
98 | 120 | )
|
99 | 121 |
|
100 | 122 | expected_pca_values = np.array([[-1.414, 0.0], [np.nan, np.nan], [1.414, 0.0]])
|
101 |
| - expected_component_values = np.array([[0.70711, 0.70711], [-0.70711, 0.70711]]) |
102 | 123 | expected_explained_variance_ratios_values = [1.0, 0.0]
|
103 | 124 |
|
104 |
| - np.testing.assert_equal(principal_components.size, 4) |
105 |
| - np.testing.assert_equal(explained_variances.size, 2) |
106 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
107 |
| - np.testing.assert_equal(pca_array.shape, DATA.shape) |
108 |
| - |
109 |
| - np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3) |
110 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3) |
111 |
| - np.testing.assert_array_almost_equal( |
112 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3 |
| 125 | + _assert_expected_values( |
| 126 | + pca_array=pca_array, |
| 127 | + principal_components=principal_components, |
| 128 | + explained_variances=explained_variances, |
| 129 | + explained_variance_ratios=explained_variance_ratios, |
| 130 | + expected_pca_values=expected_pca_values, |
| 131 | + # The original test expected the component values given by EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE |
| 132 | + expected_component_values=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE, |
| 133 | + expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES, |
| 134 | + expected_explained_variance_ratios_values=expected_explained_variance_ratios_values, |
| 135 | + decimal_accuracy=3, |
| 136 | + data_shape=DATA_WITH_NAN.shape, |
113 | 137 | )
|
114 | 138 |
|
115 | 139 |
|
116 | 140 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
117 | 141 | def test_pca_with_nan_replace():
|
118 | 142 | """Test that PCA function gives correct output for Numpy array input that has NaN values and replace strategy."""
|
119 |
| - data = np.array([[1, 1], [2, np.nan], [3, 3]]) |
120 | 143 | pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
|
121 |
| - data, 2, nodata_handling="replace" |
| 144 | + DATA_WITH_NAN, 2, nodata_handling="replace" |
122 | 145 | )
|
123 | 146 |
|
124 |
| - expected_pca_values = np.array([[-1.73205, 1.11022e-16], [0, 0], [1.73205, 1.11022e-16]]) |
125 |
| - expected_component_values = np.array([[0.707, 0.707], [0.707, -0.707]]) |
126 |
| - expected_explained_variance_ratios_values = [1.0, 4.10865e-33] |
127 |
| - |
128 |
| - np.testing.assert_equal(principal_components.size, 4) |
129 |
| - np.testing.assert_equal(explained_variances.size, 2) |
130 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
131 |
| - np.testing.assert_equal(pca_array.shape, DATA.shape) |
132 |
| - |
133 |
| - np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3) |
134 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3) |
135 |
| - np.testing.assert_array_almost_equal( |
136 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3 |
| 147 | + _assert_expected_values( |
| 148 | + pca_array=pca_array, |
| 149 | + principal_components=principal_components, |
| 150 | + explained_variances=explained_variances, |
| 151 | + explained_variance_ratios=explained_variance_ratios, |
| 152 | + expected_pca_values=EXPECTED_DATA_PCA_VALUES, |
| 153 | + expected_component_values=EXPECTED_DATA_COMPONENT_VALUES, |
| 154 | + expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE, |
| 155 | + expected_explained_variance_ratios_values=EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES, |
| 156 | + decimal_accuracy=3, |
| 157 | + data_shape=DATA_WITH_NAN.shape, |
137 | 158 | )
|
138 | 159 |
|
139 | 160 |
|
140 | 161 | @pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
|
141 | 162 | def test_pca_with_nodata_removal():
|
142 | 163 | """Test that PCA function gives correct output for input that has specified nodata values and removal strategy."""
|
143 |
| - data = np.array([[1, 1], [2, -9999], [3, 3]]) |
144 | 164 | pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
|
145 |
| - data, 2, nodata_handling="remove", nodata=-9999 |
| 165 | + DATA_WITH_NODATA, 2, nodata_handling="remove", nodata=-9999 |
146 | 166 | )
|
147 | 167 |
|
148 | 168 | expected_pca_values = np.array([[-1.414, 0.0], [np.nan, np.nan], [1.414, 0.0]])
|
149 |
| - expected_component_values = np.array([[0.707, 0.707], [-0.707, 0.707]]) |
150 | 169 | expected_explained_variance_ratios_values = [1.0, 0.0]
|
151 | 170 |
|
152 |
| - np.testing.assert_equal(principal_components.size, 4) |
153 |
| - np.testing.assert_equal(explained_variances.size, 2) |
154 |
| - np.testing.assert_equal(explained_variance_ratios.size, 2) |
155 |
| - np.testing.assert_equal(pca_array.shape, DATA.shape) |
156 |
| - |
157 |
| - np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3) |
158 |
| - np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3) |
159 |
| - np.testing.assert_array_almost_equal( |
160 |
| - explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3 |
| 171 | + _assert_expected_values( |
| 172 | + pca_array=pca_array, |
| 173 | + principal_components=principal_components, |
| 174 | + explained_variances=explained_variances, |
| 175 | + explained_variance_ratios=explained_variance_ratios, |
| 176 | + expected_pca_values=expected_pca_values, |
| 177 | + # The original test expected the component values given by EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE |
| 178 | + expected_component_values=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE, |
| 179 | + expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES, |
| 180 | + expected_explained_variance_ratios_values=expected_explained_variance_ratios_values, |
| 181 | + decimal_accuracy=3, |
| 182 | + data_shape=DATA_WITH_NODATA.shape, |
161 | 183 | )
|
162 | 184 |
|
163 | 185 |
|
|
0 commit comments