Skip to content

Commit 771efd4

Browse files
nialov authored and nmaarnio committed
test(pca_test): relax nan and nodata tests
The results sometimes deviate in order (#454). The problematic tests were relaxed to expect this. Also refactored other test code to decrease code repetition.
1 parent 59bbda1 commit 771efd4

File tree

1 file changed

+114
-92
lines changed

1 file changed

+114
-92
lines changed

tests/exploratory_analyses/pca_test.py

+114-92
Original file line number | Diff line number | Diff line change
@@ -1,163 +1,185 @@
11
import sys
2-
from pathlib import Path
32

43
import geopandas as gpd
54
import numpy as np
65
import pandas as pd
76
import pytest
7+
from beartype.typing import Optional
88
from shapely.geometry import Point
99

1010
from eis_toolkit.exceptions import EmptyDataException, InvalidColumnException, InvalidParameterValueException
1111
from eis_toolkit.exploratory_analyses.pca import compute_pca
1212

13-
parent_dir = Path(__file__).parent
14-
MULTIBAND_RASTER_PATH = parent_dir.joinpath("../data/remote/small_raster_multiband.tif")
15-
1613
DATA = np.array([[1, 1], [2, 2], [3, 3]])
14+
EXPECTED_DATA_PCA_VALUES = expected_pca_values = np.array(
15+
[[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]]
16+
)
17+
EXPECTED_DATA_COMPONENT_VALUES = np.array([[0.70711, 0.70711], [0.70711, -0.70711]])
18+
EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE = np.array([[0.70711, 0.70711], [-0.70711, 0.70711]])
19+
EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES = [1.0, 4.10865055e-33]
20+
21+
DATA_DF = pd.DataFrame(data=DATA, columns=["A", "B"])
22+
EXPECTED_DATA_DF_COLUMNS = ["principal_component_1", "principal_component_2"]
23+
24+
DATA_GDF = gpd.GeoDataFrame(
25+
data=DATA, columns=["A", "B"], geometry=[Point(1, 2), Point(2, 1), Point(3, 3)], crs="EPSG:4326"
26+
)
27+
EXPECTED_DATA_GDF_COLUMNS = ["principal_component_1", "principal_component_2", "geometry"]
28+
29+
DATA_WITH_NAN = np.array([[1, 1], [2, np.nan], [3, 3]])
30+
DATA_WITH_NODATA = np.array([[1, 1], [2, -9999], [3, 3]])
31+
32+
33+
def _assert_expected_values(
34+
pca_array: np.ndarray,
35+
principal_components,
36+
explained_variances,
37+
explained_variance_ratios,
38+
expected_pca_values=EXPECTED_DATA_PCA_VALUES,
39+
expected_component_values=EXPECTED_DATA_COMPONENT_VALUES,
40+
expected_component_values_alternative: Optional[np.ndarray] = None,
41+
expected_explained_variance_ratios_values=EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES,
42+
decimal_accuracy: int = 5,
43+
data_shape=DATA.shape,
44+
):
45+
np.testing.assert_equal(principal_components.size, 4)
46+
np.testing.assert_equal(explained_variances.size, 2)
47+
np.testing.assert_equal(explained_variance_ratios.size, 2)
48+
np.testing.assert_equal(pca_array.shape, data_shape)
49+
np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=decimal_accuracy)
50+
51+
try:
52+
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=decimal_accuracy)
53+
except AssertionError:
54+
# Both variations in the sign of the last two members of principal_components occur
55+
# depending on environment in nan and nodata tests
56+
# Both are allowed for those
57+
if expected_component_values_alternative is None:
58+
# Deviations in order are not expected unless *_alternative array is passed as input
59+
raise
60+
np.testing.assert_array_almost_equal(
61+
principal_components, expected_component_values_alternative, decimal=decimal_accuracy
62+
)
63+
64+
np.testing.assert_array_almost_equal(
65+
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=decimal_accuracy
66+
)
1767

1868

1969
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
2070
def test_pca_numpy_array():
2171
"""Test that PCA function gives correct output for Numpy array input."""
2272
pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA, 2)
2373

24-
expected_pca_array_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]])
25-
expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]])
26-
expected_explained_variance_ratios_values = [1.0, 4.10865055e-33]
27-
28-
np.testing.assert_equal(principal_components.size, 4)
29-
np.testing.assert_equal(explained_variances.size, 2)
30-
np.testing.assert_equal(explained_variance_ratios.size, 2)
31-
np.testing.assert_equal(pca_array.shape, DATA.shape)
32-
33-
np.testing.assert_array_almost_equal(pca_array, expected_pca_array_values, decimal=5)
34-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5)
35-
np.testing.assert_array_almost_equal(
36-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5
74+
_assert_expected_values(
75+
pca_array=pca_array,
76+
principal_components=principal_components,
77+
explained_variances=explained_variances,
78+
explained_variance_ratios=explained_variance_ratios,
3779
)
3880

3981

4082
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
4183
def test_pca_df():
4284
"""Test that PCA function gives correct output for DF input."""
43-
data_df = pd.DataFrame(data=DATA, columns=["A", "B"])
4485

45-
pca_df, principal_components, explained_variances, explained_variance_ratios = compute_pca(data_df, 2)
86+
pca_df, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA_DF, 2)
4687

47-
expected_columns = ["principal_component_1", "principal_component_2"]
48-
expected_pca_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]])
49-
expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]])
50-
expected_explained_variance_ratios_values = [1.0, 4.10865055e-33]
51-
52-
np.testing.assert_equal(principal_components.size, 4)
53-
np.testing.assert_equal(explained_variances.size, 2)
54-
np.testing.assert_equal(explained_variance_ratios.size, 2)
55-
np.testing.assert_equal(list(pca_df.columns), expected_columns)
56-
np.testing.assert_equal(pca_df.shape, data_df.shape)
57-
58-
np.testing.assert_array_almost_equal(pca_df.values, expected_pca_values, decimal=5)
59-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5)
60-
np.testing.assert_array_almost_equal(
61-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5
88+
_assert_expected_values(
89+
pca_array=pca_df.values,
90+
principal_components=principal_components,
91+
explained_variances=explained_variances,
92+
explained_variance_ratios=explained_variance_ratios,
6293
)
94+
np.testing.assert_equal(list(pca_df.columns), EXPECTED_DATA_DF_COLUMNS)
95+
np.testing.assert_equal(pca_df.shape, DATA_DF.shape)
6396

6497

6598
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
6699
def test_pca_gdf():
67100
"""Test that PCA function gives correct output for GDF input."""
68-
data_gdf = gpd.GeoDataFrame(
69-
data=DATA, columns=["A", "B"], geometry=[Point(1, 2), Point(2, 1), Point(3, 3)], crs="EPSG:4326"
70-
)
71-
72-
pca_gdf, principal_components, explained_variances, explained_variance_ratios = compute_pca(data_gdf, 2)
73101

74-
expected_columns = ["principal_component_1", "principal_component_2", "geometry"]
75-
expected_pca_values = np.array([[-1.73205081, 1.11022302e-16], [0.0, 0.0], [1.73205081, 1.11022302e-16]])
76-
expected_component_values = np.array([[0.70711, 0.70711], [0.70711, -0.70711]])
77-
expected_explained_variance_ratios_values = [1.0, 4.10865055e-33]
102+
pca_gdf, principal_components, explained_variances, explained_variance_ratios = compute_pca(DATA_GDF, 2)
78103

79-
np.testing.assert_equal(principal_components.size, 4)
80-
np.testing.assert_equal(explained_variances.size, 2)
81-
np.testing.assert_equal(explained_variance_ratios.size, 2)
82-
np.testing.assert_equal(list(pca_gdf.columns), expected_columns)
83-
np.testing.assert_equal(pca_gdf.shape, data_gdf.shape)
84-
85-
np.testing.assert_array_almost_equal(pca_gdf.drop(columns=["geometry"]).values, expected_pca_values, decimal=5)
86-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=5)
87-
np.testing.assert_array_almost_equal(
88-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=5
104+
_assert_expected_values(
105+
pca_array=pca_gdf.drop(columns=["geometry"]).values,
106+
principal_components=principal_components,
107+
explained_variances=explained_variances,
108+
explained_variance_ratios=explained_variance_ratios,
89109
)
90110

111+
np.testing.assert_equal(list(pca_gdf.columns), EXPECTED_DATA_GDF_COLUMNS)
112+
np.testing.assert_equal(pca_gdf.shape, DATA_GDF.shape)
113+
91114

92115
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
93116
def test_pca_with_nan_removal():
94117
"""Test that PCA function gives correct output for Numpy array input that has NaN values and remove strategy."""
95-
data = np.array([[1, 1], [2, np.nan], [3, 3]])
96118
pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
97-
data, 2, nodata_handling="remove"
119+
DATA_WITH_NAN, 2, nodata_handling="remove"
98120
)
99121

100122
expected_pca_values = np.array([[-1.414, 0.0], [np.nan, np.nan], [1.414, 0.0]])
101-
expected_component_values = np.array([[0.70711, 0.70711], [-0.70711, 0.70711]])
102123
expected_explained_variance_ratios_values = [1.0, 0.0]
103124

104-
np.testing.assert_equal(principal_components.size, 4)
105-
np.testing.assert_equal(explained_variances.size, 2)
106-
np.testing.assert_equal(explained_variance_ratios.size, 2)
107-
np.testing.assert_equal(pca_array.shape, DATA.shape)
108-
109-
np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3)
110-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3)
111-
np.testing.assert_array_almost_equal(
112-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3
125+
_assert_expected_values(
126+
pca_array=pca_array,
127+
principal_components=principal_components,
128+
explained_variances=explained_variances,
129+
explained_variance_ratios=explained_variance_ratios,
130+
expected_pca_values=expected_pca_values,
131+
# Original implementation expected order as defined by EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE
132+
expected_component_values=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE,
133+
expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES,
134+
expected_explained_variance_ratios_values=expected_explained_variance_ratios_values,
135+
decimal_accuracy=3,
136+
data_shape=DATA_WITH_NAN.shape,
113137
)
114138

115139

116140
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
117141
def test_pca_with_nan_replace():
118142
"""Test that PCA function gives correct output for Numpy array input that has NaN values and replace strategy."""
119-
data = np.array([[1, 1], [2, np.nan], [3, 3]])
120143
pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
121-
data, 2, nodata_handling="replace"
144+
DATA_WITH_NAN, 2, nodata_handling="replace"
122145
)
123146

124-
expected_pca_values = np.array([[-1.73205, 1.11022e-16], [0, 0], [1.73205, 1.11022e-16]])
125-
expected_component_values = np.array([[0.707, 0.707], [0.707, -0.707]])
126-
expected_explained_variance_ratios_values = [1.0, 4.10865e-33]
127-
128-
np.testing.assert_equal(principal_components.size, 4)
129-
np.testing.assert_equal(explained_variances.size, 2)
130-
np.testing.assert_equal(explained_variance_ratios.size, 2)
131-
np.testing.assert_equal(pca_array.shape, DATA.shape)
132-
133-
np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3)
134-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3)
135-
np.testing.assert_array_almost_equal(
136-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3
147+
_assert_expected_values(
148+
pca_array=pca_array,
149+
principal_components=principal_components,
150+
explained_variances=explained_variances,
151+
explained_variance_ratios=explained_variance_ratios,
152+
expected_pca_values=EXPECTED_DATA_PCA_VALUES,
153+
expected_component_values=EXPECTED_DATA_COMPONENT_VALUES,
154+
expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE,
155+
expected_explained_variance_ratios_values=EXPECTED_DATA_EXPLAINED_VARIANCE_RATIOS_VALUES,
156+
decimal_accuracy=3,
157+
data_shape=DATA_WITH_NAN.shape,
137158
)
138159

139160

140161
@pytest.mark.xfail(sys.platform == "win32", reason="Results deviate on Windows.", raises=AssertionError)
141162
def test_pca_with_nodata_removal():
142163
"""Test that PCA function gives correct output for input that has specified nodata values and removal strategy."""
143-
data = np.array([[1, 1], [2, -9999], [3, 3]])
144164
pca_array, principal_components, explained_variances, explained_variance_ratios = compute_pca(
145-
data, 2, nodata_handling="remove", nodata=-9999
165+
DATA_WITH_NODATA, 2, nodata_handling="remove", nodata=-9999
146166
)
147167

148168
expected_pca_values = np.array([[-1.414, 0.0], [np.nan, np.nan], [1.414, 0.0]])
149-
expected_component_values = np.array([[0.707, 0.707], [-0.707, 0.707]])
150169
expected_explained_variance_ratios_values = [1.0, 0.0]
151170

152-
np.testing.assert_equal(principal_components.size, 4)
153-
np.testing.assert_equal(explained_variances.size, 2)
154-
np.testing.assert_equal(explained_variance_ratios.size, 2)
155-
np.testing.assert_equal(pca_array.shape, DATA.shape)
156-
157-
np.testing.assert_array_almost_equal(pca_array, expected_pca_values, decimal=3)
158-
np.testing.assert_array_almost_equal(principal_components, expected_component_values, decimal=3)
159-
np.testing.assert_array_almost_equal(
160-
explained_variance_ratios, expected_explained_variance_ratios_values, decimal=3
171+
_assert_expected_values(
172+
pca_array=pca_array,
173+
principal_components=principal_components,
174+
explained_variances=explained_variances,
175+
explained_variance_ratios=explained_variance_ratios,
176+
expected_pca_values=expected_pca_values,
177+
# Original implementation expected order as defined by EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE
178+
expected_component_values=EXPECTED_DATA_COMPONENT_VALUES_ALTERNATIVE,
179+
expected_component_values_alternative=EXPECTED_DATA_COMPONENT_VALUES,
180+
expected_explained_variance_ratios_values=expected_explained_variance_ratios_values,
181+
decimal_accuracy=3,
182+
data_shape=DATA_WITH_NODATA.shape,
161183
)
162184

163185

0 commit comments

Comments
 (0)