Commit ccc2dad

Merge pull request #446 from GispoCoding/445-fixmodify-pca-tools

Fix and update PCA tools

2 parents dfd3bbd + 82db606

File tree

3 files changed: +134 −65 lines changed

eis_toolkit/cli.py (+33 −19)
@@ -701,7 +701,7 @@ def parallel_coordinates_cli(
 def compute_pca_raster_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
     output_raster: OUTPUT_FILE_OPTION,
-    number_of_components: int = typer.Option(),
+    number_of_components: Optional[int] = None,
     # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func
     nodata_handling: Annotated[NodataHandling, typer.Option(case_sensitive=False)] = NodataHandling.remove,
     # NOTE: Omitted nodata parameter. Should use raster nodata.
@@ -715,27 +715,34 @@ def compute_pca_raster_cli(
     stacked_array, profiles = read_and_stack_rasters(input_rasters, nodata_handling="convert_to_nan")
     typer.echo("Progress: 25%")

-    pca_array, variance_ratios = compute_pca(
+    transformed_data, principal_components, variances, variance_ratios = compute_pca(
         data=stacked_array, number_of_components=number_of_components, nodata_handling=get_enum_values(nodata_handling)
     )

     # Fill np.nan with nodata before writing data to raster
-    pca_array[pca_array == np.nan] = -9999
+    transformed_data[transformed_data == np.nan] = -9999
     out_profile = profiles[0]
     out_profile["nodata"] = -9999

     # Update nr of bands
-    out_profile["count"] = number_of_components
+    out_profile["count"] = len(variances)

     # Create dictionary from the variance ratios array
-    variances_ratios_dict = {}
-    for i, variance_ratio in enumerate(variance_ratios):
-        name = "PC " + str(i) + " explained variance"
-        variances_ratios_dict[name] = variance_ratio
-    json_str = json.dumps(variances_ratios_dict)
+    # variances_ratios_dict = {}
+    # for i, variance_ratio in enumerate(variance_ratios):
+    #     name = "PC " + str(i) + " explained variance"
+    #     variances_ratios_dict[name] = variance_ratio
+    # json_str = json.dumps(variances_ratios_dict)
+
+    out_dict = {
+        "principal_components": np.round(principal_components, 4).tolist(),
+        "explained_variances": np.round(variances, 4).tolist(),
+        "explained_variance_ratios": np.round(variance_ratios, 4).tolist(),
+    }
+    json_str = json.dumps(out_dict)

     with rasterio.open(output_raster, "w", **out_profile) as dst:
-        dst.write(pca_array)
+        dst.write(transformed_data)

     typer.echo("Progress: 100%")
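
Two notes on the raster path above. First, the new metadata payload serializes as plain JSON; a minimal standalone sketch with made-up PCA values (not taken from the toolkit):

import json
import numpy as np

# Hypothetical outputs of a 2-component PCA over 3 input bands
principal_components = np.array([[0.7071, 0.7071, 0.0], [-0.7071, 0.7071, 0.0]])
variances = np.array([2.1, 0.6])
variance_ratios = np.array([0.7778, 0.2222])

out_dict = {
    "principal_components": np.round(principal_components, 4).tolist(),
    "explained_variances": np.round(variances, 4).tolist(),
    "explained_variance_ratios": np.round(variance_ratios, 4).tolist(),
}
print(json.dumps(out_dict))

Second, the retained line transformed_data[transformed_data == np.nan] = -9999 can never match anything: NaN compares unequal to everything, including itself, so the usual test would be transformed_data[np.isnan(transformed_data)] = -9999.
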
@@ -748,7 +755,7 @@ def compute_pca_raster_cli(
 def compute_pca_vector_cli(
     input_vector: INPUT_FILE_OPTION,
     output_vector: OUTPUT_FILE_OPTION,
-    number_of_components: int = typer.Option(),
+    number_of_components: Optional[int] = None,
     columns: Annotated[List[str], typer.Option()] = None,
     # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func
     nodata_handling: Annotated[NodataHandling, typer.Option(case_sensitive=False)] = NodataHandling.remove,
@@ -762,7 +769,7 @@ def compute_pca_vector_cli(
     gdf = gpd.read_file(input_vector)
     typer.echo("Progress: 25%")

-    pca_gdf, variance_ratios = compute_pca(
+    transformed_data, principal_components, variances, variance_ratios = compute_pca(
         data=gdf,
         number_of_components=number_of_components,
         columns=columns,
@@ -771,13 +778,20 @@ def compute_pca_vector_cli(
     )

     # Create dictionary from the variance ratios array
-    variances_ratios_dict = {}
-    for i, variance_ratio in enumerate(variance_ratios):
-        name = "PC " + str(i) + " explained variance"
-        variances_ratios_dict[name] = variance_ratio
-    json_str = json.dumps(variances_ratios_dict)
-
-    pca_gdf.to_file(output_vector)
+    # variances_ratios_dict = {}
+    # for i, variance_ratio in enumerate(variance_ratios):
+    #     name = "PC " + str(i) + " explained variance"
+    #     variances_ratios_dict[name] = variance_ratio
+    # json_str = json.dumps(variances_ratios_dict)
+
+    out_dict = {
+        "principal_components": np.round(principal_components, 4).tolist(),
+        "explained_variances": np.round(variances, 4).tolist(),
+        "explained_variance_ratios": np.round(variance_ratios, 4).tolist(),
+    }
+    json_str = json.dumps(out_dict)
+
+    transformed_data.to_file(output_vector)
     typer.echo("Progress: 100%")

     typer.echo(f"Results: {json_str}")

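Outside Typer, the updated four-value contract can be exercised directly. A minimal sketch of the vector flow, assuming the import path below (inferred from the module's file path) and with placeholder file paths and column names:

import geopandas as gpd
from eis_toolkit.exploratory_analyses.pca import compute_pca

gdf = gpd.read_file("input.gpkg")  # placeholder input path
transformed_data, principal_components, variances, variance_ratios = compute_pca(
    data=gdf, number_of_components=2, columns=["a", "b", "c"]  # placeholder columns
)
transformed_data.to_file("output.gpkg")  # placeholder output path
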
eis_toolkit/exploratory_analyses/pca.py (+39 −26)
@@ -59,28 +59,30 @@ def _handle_missing_values(
 @beartype
 def _compute_pca(
     feature_matrix: np.ndarray, number_of_components: int, scaler_type: str
-) -> Tuple[np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
     scaler = SCALERS[scaler_type]()
     scaled_data = scaler.fit_transform(feature_matrix)

     pca = PCA(n_components=number_of_components)
-    principal_components = pca.fit_transform(scaled_data)
-    explained_variances = pca.explained_variance_ratio_
+    transformed_data = pca.fit_transform(scaled_data)
+    principal_components = pca.components_
+    explained_variances = pca.explained_variance_
+    explained_variance_ratios = pca.explained_variance_ratio_

-    return principal_components, explained_variances
+    return transformed_data, principal_components, explained_variances, explained_variance_ratios


 @beartype
 def compute_pca(
     data: Union[np.ndarray, pd.DataFrame, gpd.GeoDataFrame],
-    number_of_components: int,
+    number_of_components: Optional[int] = None,
     columns: Optional[Sequence[str]] = None,
     scaler_type: Literal["standard", "min_max", "robust"] = "standard",
     nodata_handling: Literal["remove", "replace"] = "remove",
     nodata: Optional[Number] = None,
-) -> Tuple[Union[np.ndarray, pd.DataFrame, gpd.GeoDataFrame], np.ndarray]:
+) -> Tuple[Union[np.ndarray, pd.DataFrame, gpd.GeoDataFrame], np.ndarray, np.ndarray, np.ndarray]:
     """
-    Compute defined number of principal components for numeric input data.
+    Compute defined number of principal components for numeric input data and transform the data.

     Before computation, data is scaled according to specified scaler and NaN values removed or replaced.
     Optionally, a nodata value can be given to handle similarly as NaN values.
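
The core of the fix is visible in _compute_pca above: scikit-learn's PCA.fit_transform returns the transformed data (component scores), while the components themselves live in the fitted estimator's attributes; the old code conflated the two. A standalone sketch of the distinction, with synthetic data:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 3))  # synthetic: 100 samples, 3 features

pca = PCA(n_components=2)
scores = pca.fit_transform(data)       # shape (100, 2): data projected into PC space
loadings = pca.components_             # shape (2, 3): one row per principal component
print(pca.explained_variance_)         # absolute variance per component
print(pca.explained_variance_ratio_)   # fraction of total variance per component
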
@@ -93,7 +95,8 @@ def compute_pca(
     Args:
         data: Input data for PCA.
         number_of_components: The number of principal components to compute. Should be >= 1 and at most
-            the number of numeric columns if input is (Geo)Dataframe.
+            the number of features found in input data. If not defined, will be the same as number of
+            features in data. Defaults to None.
         columns: Select columns used for the PCA. Other columns are excluded from PCA, but added back
             to the result Dataframe intact. Only relevant if input is (Geo)Dataframe. Defaults to None.
         scaler_type: Transform data according to a specified Sklearn scaler.
@@ -103,8 +106,8 @@ def compute_pca(
         nodata: Define a nodata value to remove. Defaults to None.

     Returns:
-        The computed principal components in corresponding format as the input data and the
-        explained variance ratios for each component.
+        The transformed data in same format as input data, computed principal components, explained variances
+        and explained variance ratios for each component.

     Raises:
         EmptyDataException: The input is empty.
@@ -116,7 +119,7 @@ def compute_pca(
     if scaler_type not in SCALERS:
         raise InvalidParameterValueException(f"Invalid scaler. Choose from: {list(SCALERS.keys())}")

-    if number_of_components < 1:
+    if number_of_components is not None and number_of_components < 1:
         raise InvalidParameterValueException("The number of principal components should be >= 1.")

     # Get feature matrix (Numpy array) from various input types
@@ -158,40 +161,50 @@ def compute_pca(
     feature_matrix = feature_matrix.astype(float)
     feature_matrix, nan_mask = _handle_missing_values(feature_matrix, nodata_handling, nodata)

+    # Default number of components to number of features in data if not defined
+    if number_of_components is None:
+        number_of_components = feature_matrix.shape[1]
+
     if number_of_components > feature_matrix.shape[1]:
-        raise InvalidParameterValueException("The number of principal components is too high for the given input data.")
+        raise InvalidParameterValueException(
+            "The number of principal components is too high for the given input data "
+            + f"({number_of_components} > {feature_matrix.shape[1]})."
+        )
+
     # Core PCA computation
-    principal_components, explained_variances = _compute_pca(feature_matrix, number_of_components, scaler_type)
+    transformed_data, principal_components, explained_variances, explained_variance_ratios = _compute_pca(
+        feature_matrix, number_of_components, scaler_type
+    )

     if nodata_handling == "remove" and nan_mask is not None:
-        principal_components_with_nans = np.full((nan_mask.size, principal_components.shape[1]), np.nan)
-        principal_components_with_nans[~nan_mask, :] = principal_components
-        principal_components = principal_components_with_nans
+        transformed_data_with_nans = np.full((nan_mask.size, transformed_data.shape[1]), np.nan)
+        transformed_data_with_nans[~nan_mask, :] = transformed_data
+        transformed_data = transformed_data_with_nans

     # Convert PCA output to proper format
     if isinstance(data, np.ndarray):
         if data.ndim == 3:
-            result_data = principal_components.reshape(rows, cols, -1).transpose(2, 0, 1)
+            transformed_data_out = transformed_data.reshape(rows, cols, -1).transpose(2, 0, 1)
         else:
-            result_data = principal_components
+            transformed_data_out = transformed_data

     elif isinstance(data, pd.DataFrame):
         component_names = [f"principal_component_{i+1}" for i in range(number_of_components)]
-        result_data = pd.DataFrame(data=principal_components, columns=component_names)
+        transformed_data_out = pd.DataFrame(data=transformed_data, columns=component_names)
         if columns is not None:
             old_columns = [column for column in data.columns if column not in columns]
             for column in old_columns:
-                result_data[column] = data[column]
+                transformed_data_out[column] = data[column]
         if isinstance(data, gpd.GeoDataFrame):
-            result_data = gpd.GeoDataFrame(result_data, geometry=geometries, crs=crs)
+            transformed_data_out = gpd.GeoDataFrame(transformed_data_out, geometry=geometries, crs=crs)

-    return result_data, explained_variances
+    return transformed_data_out, principal_components, explained_variances, explained_variance_ratios


 @beartype
 def plot_pca(
     pca_df: pd.DataFrame,
-    explained_variances: Optional[np.ndarray] = None,
+    explained_variance_ratios: Optional[np.ndarray] = None,
     color_column_name: Optional[str] = None,
     save_path: Optional[str] = None,
 ) -> sns.PairGrid:
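
The nodata reinsertion in the hunk above follows a simple masking pattern: rows dropped before the fit come back as all-NaN rows, so the output stays aligned with the input. A minimal standalone sketch with made-up values:

import numpy as np

nan_mask = np.array([False, True, False])               # True where input rows were dropped
transformed_data = np.array([[1.0, 2.0], [3.0, 4.0]])   # PCA output for the kept rows

with_nans = np.full((nan_mask.size, transformed_data.shape[1]), np.nan)
with_nans[~nan_mask, :] = transformed_data
print(with_nans)  # row 1 is all NaN; rows 0 and 2 carry the PCA output
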
@@ -203,7 +216,7 @@ def plot_pca(

     Args:
         pca_df: A DataFrame containing computed principal components.
-        explained_variances: The explained variance ratios for each principal component. Used for labeling
+        explained_variance_ratios: The explained variance ratios for each principal component. Used for labeling
             axes in the plot. Optional parameter. Defaults to None.
         color_column_name: Name of the column that will be used for color-coding data points. Typically a
             categorical variable in the original data. Optional parameter, no colors if not provided.
@@ -226,8 +239,8 @@ def plot_pca(
     pair_grid = sns.pairplot(filtered_df, hue=color_column_name)

     # Add explained variances to axis labels if provided
-    if explained_variances is not None:
-        labels = [f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(explained_variances * 100)]
+    if explained_variance_ratios is not None:
+        labels = [f"PC {i+1} ({var:.1f}%)" for i, var in enumerate(explained_variance_ratios * 100)]
     else:
         labels = [f"PC {i+1}" for i in range(len(pair_grid.axes))]
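
With the parameter renamed, callers pass the ratio array to plot_pca explicitly. A hedged usage sketch with made-up component values, assuming the import path below (inferred from the module's file path):

import numpy as np
import pandas as pd
from eis_toolkit.exploratory_analyses.pca import plot_pca

pca_df = pd.DataFrame({
    "principal_component_1": [0.1, -0.4, 0.7],  # made-up scores
    "principal_component_2": [1.2, 0.3, -0.5],
})
grid = plot_pca(pca_df, explained_variance_ratios=np.array([0.8, 0.15]))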