diff --git a/aeon/clustering/_clarans.py b/aeon/clustering/_clarans.py index 2ee497366b..6d2d8298f6 100644 --- a/aeon/clustering/_clarans.py +++ b/aeon/clustering/_clarans.py @@ -128,7 +128,7 @@ def _fit_one_init(self, X: np.ndarray, max_neighbours: int): j = 0 X_indexes = np.arange(X.shape[0], dtype=int) if isinstance(self._init, Callable): - best_medoids = self._init(X) + best_medoids = self._init(X=X) else: best_medoids = self._init best_non_medoids = np.setdiff1d(X_indexes, best_medoids) diff --git a/aeon/clustering/_cluster_initialisation.py b/aeon/clustering/_cluster_initialisation.py new file mode 100644 index 0000000000..d950eed5cc --- /dev/null +++ b/aeon/clustering/_cluster_initialisation.py @@ -0,0 +1,437 @@ +"""Initialisation strategies for clustering. + +This file contains the various initialisation algorithms that can be used +with clustering algorithms. + +The functions with "indexes" in their names return the indexes of the +initialised clusters, while the functions with "centers" return the +actual centers. +""" + +from collections.abc import Callable +from functools import partial + +import numpy as np +from numpy.random import RandomState + +from aeon.distances import pairwise_distance + + +def _random_center_initialiser_indexes( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return random_state.choice(X.shape[0], n_clusters, replace=False) + + +def _random_center_initialiser( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return X[ + _random_center_initialiser_indexes( + X=X, n_clusters=n_clusters, random_state=random_state + ) + ] + + +def _first_center_initialiser_indexes( + *, X: np.ndarray, n_clusters: int, random_state: RandomState, **kwargs +) -> np.ndarray: + return np.arange(n_clusters) + + +def _first_center_initialiser( + *, X: np.ndarray, n_clusters: int, random_state: RandomState +) -> np.ndarray: + return X[ + _first_center_initialiser_indexes( + X=X, n_clusters=n_clusters, random_state=random_state + ) + ] + + +def _random_values_center_initialiser( + *, X: np.ndarray, n_clusters: int, random_state: RandomState, **kwargs +): + return random_state.rand(n_clusters, X.shape[-2], X.shape[-1]) + + +def _kmeans_plus_plus_center_initialiser_indexes( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + return_distance_and_labels: bool = False, + **kwargs, +) -> np.ndarray: + n_samples = X.shape[0] + initial_center_idx = random_state.randint(n_samples) + indexes = [initial_center_idx] + + min_distances = pairwise_distance( + X, + X[[initial_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + labels = np.zeros(n_samples, dtype=int) + + for i in range(1, n_clusters): + d = min_distances.copy() + chosen = np.asarray(indexes, dtype=int) + finite_mask = np.isfinite(d) + if not np.any(finite_mask): + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + labels[closer_points] = i + continue + + min_val = d[finite_mask].min() + w = d - min_val + w[~np.isfinite(w)] = 0.0 + w = np.clip(w, 0.0, None) + 
w[chosen] = 0.0 + + total = w.sum() + if total <= 0.0: + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + else: + p = w / total + p = np.clip(p, 0.0, None) + p_sum = p.sum() + if p_sum <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = p / p_sum + next_center_idx = random_state.choice(n_samples, p=p) + + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + labels[closer_points] = i + + if return_distance_and_labels: + return np.array(indexes), labels, min_distances + else: + return np.array(indexes) + + +def _kmeans_plus_plus_center_initialiser( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + return_distance_and_labels: bool = False, + **kwargs, +) -> np.ndarray: + indexes, labels, min_distances = _kmeans_plus_plus_center_initialiser_indexes( + X=X, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + return_distance_and_labels=True, + ) + if return_distance_and_labels: + return X[indexes], labels, min_distances + return X[indexes] + + +def _kmedoids_plus_plus_center_initialiser_indexes( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + **kwargs, +) -> np.ndarray: + """K-medoids++ initialisation that returns indexes. + + This uses a k-means++-style seeding procedure, but with medoids, + and supports potentially negative distances by shifting the + distance distribution to be non-negative. 
+ """ + n_samples = X.shape[0] + initial_center_idx = random_state.randint(n_samples) + indexes = [initial_center_idx] + + # Initial minimum distances to the first medoid + min_distances = pairwise_distance( + X, + X[[initial_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + for _ in range(1, n_clusters): + d = min_distances.copy() + chosen = np.asarray(indexes, dtype=int) + + finite_mask = np.isfinite(d) + if not np.any(finite_mask): + candidates = np.setdiff1d(np.arange(n_samples), chosen, assume_unique=False) + next_center_idx = random_state.choice(candidates) + else: + min_val = d[finite_mask].min() + w = d - min_val + + w[~np.isfinite(w)] = 0.0 + + w = np.clip(w, 0.0, None) + + w[chosen] = 0.0 + + total = w.sum() + if total <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = w / total + p = np.clip(p, 0.0, None) + p_sum = p.sum() + if p_sum <= 0.0: + candidates = np.setdiff1d( + np.arange(n_samples), chosen, assume_unique=False + ) + next_center_idx = random_state.choice(candidates) + else: + p = p / p_sum + next_center_idx = random_state.choice(n_samples, p=p) + + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, + X[[next_center_idx]], + method=distance, + n_jobs=n_jobs, + **distance_params, + ).reshape(n_samples) + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + + return np.array(indexes) + + +def _kmedoids_plus_plus_center_initialiser( + *, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable, + distance_params: dict, + n_jobs: int = 1, + **kwargs, +) -> np.ndarray: + """K-medoids++ initialisation that returns centers.""" + indexes = _kmedoids_plus_plus_center_initialiser_indexes( + X=X, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + ) + return X[indexes] + + +def resolve_center_initialiser( + init: str | np.ndarray, + X: np.ndarray, + n_clusters: int, + random_state: RandomState, + distance: str | Callable | None = None, + distance_params: dict | None = None, + n_jobs: int = 1, + custom_init_handlers: dict | None = None, + use_indexes: bool = False, +) -> Callable | np.ndarray: + """Resolve the center initialiser function or array from init parameter. + + Parameters + ---------- + X : 3D np.ndarray + Input data, any number of channels, equal length series of shape ``( + n_cases, n_channels, n_timepoints)`` + or 2D np.array (univariate, equal length series) of shape + ``(n_cases, n_timepoints)`` + or list of numpy arrays (any number of channels, unequal length series) + of shape ``[n_cases]``, 2D np.array ``(n_channels, n_timepoints_i)``, + where ``n_timepoints_i`` is length of series ``i``. Other types are + allowed and converted into one of the above. + n_clusters : int + The number of clusters to form as well as the number of centroids to generate. + random_state : RandomState + If `np.random.RandomState` instance, + distance : str or callable, optional + Distance method to compute similarity between time series. A list of valid + strings for measures can be found in the documentation for + :func:`aeon.distances.get_distance_function`. If a callable is passed it must be + a function that takes two 2d numpy arrays as input and returns a float. 
+ distance_params : dict, default=None + Dictionary containing kwargs for the distance being used. For example if you + wanted to specify a window for DTW you would pass + distance_params={"window": 0.2}. See documentation of aeon.distances for more + details. + n_jobs : int, default=1 + The number of jobs to run in parallel. If -1, then the number of jobs is set + to the number of CPU cores. If 1, then the function is executed in a single + thread. If greater than 1, then the function is executed in parallel. + custom_init_handlers : dict, default=None + A dictionary of custom initialisation functions that can be used to initialise. + use_indexes : bool, default=False + Boolean when True initialisation that return indexes is returned, when false + return initialisation that returns the centres + + Returns + ------- + Callable | np.ndarray + If a ndarray is specific as init then the validated ndarray is returned, + If a string is passed then the corresponding function is returned. + """ + initialisers_dict = ( + _CENTRE_INITIALISERS if not use_indexes else _CENTRE_INITIALISER_INDEXES + ) + valid_init_methods = ", ".join(sorted(initialisers_dict.keys())) + + if isinstance(init, str): + if custom_init_handlers and init in custom_init_handlers: + return custom_init_handlers[init] + + if init not in initialisers_dict: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"The following are a list of valid init algorithms " + f"strings: {valid_init_methods}. You can also pass a " + f"np.ndarray of appropriate shape." + ) + + initialiser_func = initialisers_dict[init] + + if init in ("kmeans++", "kmedoids++"): + if distance is None or distance_params is None: + raise ValueError( + f"distance and distance_params are required for {init} " + f"initialisation" + ) + return partial( + initialiser_func, + n_clusters=n_clusters, + random_state=random_state, + distance=distance, + distance_params=distance_params, + n_jobs=n_jobs, + ) + + if init == "random_values": + return partial( + initialiser_func, + n_clusters=n_clusters, + random_state=random_state, + ) + + return partial( + initialiser_func, + n_clusters=n_clusters, + random_state=random_state, + ) + elif isinstance(init, np.ndarray): + if len(init) != n_clusters: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected length {n_clusters}, got {len(init)}." + ) + + if use_indexes: + if init.ndim != 1: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected 1D array of shape ({n_clusters},), " + f"got {init.shape}." + ) + if not np.issubdtype(init.dtype, np.integer): + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected an array of integers, got {init.dtype}." + ) + if init.min() < 0 or init.max() >= X.shape[0]: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Values must be in the range [0, {X.shape[0]})." + ) + return init + else: + if init.ndim == 1: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected multi-dimensional array of shape " + f"({n_clusters}, {X.shape[1]}, {X.shape[2]}), " + f"got {init.shape}." + ) + if init.shape[1:] != X.shape[1:]: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected shape ({n_clusters}, {X.shape[1]}, " + f"{X.shape[2]}), got {init.shape}." + ) + return init.copy() + else: + raise ValueError( + f"The value provided for init: {init} is invalid. " + f"Expected a string or np.ndarray." 
+ ) + + +_CENTRE_INITIALISERS = { + "random": _random_center_initialiser, + "first": _first_center_initialiser, + "random_values": _random_values_center_initialiser, + "kmeans++": _kmeans_plus_plus_center_initialiser, + "kmedoids++": _kmedoids_plus_plus_center_initialiser, +} + +_CENTRE_INITIALISER_INDEXES = { + "random": _random_center_initialiser_indexes, + "first": _first_center_initialiser_indexes, + "kmeans++": _kmeans_plus_plus_center_initialiser_indexes, + "kmedoids++": _kmedoids_plus_plus_center_initialiser_indexes, +} diff --git a/aeon/clustering/_elastic_som.py b/aeon/clustering/_elastic_som.py index 361979a6d8..40fc87135f 100644 --- a/aeon/clustering/_elastic_som.py +++ b/aeon/clustering/_elastic_som.py @@ -9,6 +9,9 @@ from numpy.random import RandomState from sklearn.utils.random import check_random_state +from aeon.clustering._cluster_initialisation import ( + resolve_center_initialiser, +) from aeon.clustering.base import BaseClusterer from aeon.distances import get_alignment_path_function, pairwise_distance @@ -203,7 +206,7 @@ def _fit(self, X, y=None): self._check_params(X) if isinstance(self._init, Callable): - weights = self._init(X) + weights = self._init(X=X) else: weights = self._init.copy() @@ -257,30 +260,22 @@ def _update_iteration(self, x, weights, decay_rate, num_iterations): def _check_params(self, X): self._random_state = check_random_state(self.random_state) - # random initialization - if isinstance(self.init, str): - if self.init == "random": - self._init = self._random_center_initializer - elif self.init == "kmeans++": - self._init = self._kmeans_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - else: - raise ValueError( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first" - ) + + if self.distance_params is None: + self._distance_params = {} else: - if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters: - self._init = self.init.copy() - else: - raise ValueError( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first. 
You can also pass a" - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) + self._distance_params = self.distance_params + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + use_indexes=False, + ) self._neuron_position = np.arange(self.n_clusters) @@ -331,11 +326,6 @@ def _check_params(self, X): else: self._alignment_path_callable = None - if self.distance_params is None: - self._distance_params = {} - else: - self._distance_params = self.distance_params - def _elastic_update(self, x, y, w): best_path, distance = self._alignment_path_callable( x, y, **self._distance_params @@ -362,28 +352,6 @@ def _elastic_update(self, x, y, w): return s3 - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[self._random_state.choice(X.shape[0], self.n_clusters, replace=False)] - - def _kmeans_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - X, X[indexes], method=self.distance, **self._distance_params - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - - def _first_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[list(range(self.n_clusters))] - @classmethod def _get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. diff --git a/aeon/clustering/_k_means.py b/aeon/clustering/_k_means.py index 54ce0d7b95..78b40b55ec 100644 --- a/aeon/clustering/_k_means.py +++ b/aeon/clustering/_k_means.py @@ -9,6 +9,9 @@ from numpy.random import RandomState from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import ( + resolve_center_initialiser, +) from aeon.clustering.averaging import ( VALID_BA_DISTANCE_METHODS, elastic_barycenter_average, @@ -239,7 +242,7 @@ def _fit(self, X: np.ndarray, y=None): def _fit_one_init(self, X: np.ndarray) -> tuple: if isinstance(self._init, Callable): - cluster_centres = self._init(X) + cluster_centres = self._init(X=X) else: cluster_centres = self._init.copy() prev_inertia = np.inf @@ -299,36 +302,22 @@ def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) self._n_jobs = check_n_jobs(self.n_jobs) - _incorrect_init_str = ( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmeans++, first. 
You can also pass a " - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) - - if isinstance(self.init, str): - if self.init == "random": - self._init = self._random_center_initializer - elif self.init == "kmeans++": - self._init = self._kmeans_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - else: - raise ValueError(_incorrect_init_str) - else: - if ( - isinstance(self.init, np.ndarray) - and len(self.init) == self.n_clusters - and self.init.shape[1:] == X.shape[1:] - ): - self._init = self.init.copy() - else: - raise ValueError(_incorrect_init_str) - + # Set up distance_params before init logic (needed for kmeans++ initializer) if self.distance_params is None: self._distance_params = {} else: self._distance_params = self.distance_params.copy() + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=self._n_jobs, + use_indexes=False, + ) if self.average_params is None: self._average_params = {} else: @@ -375,32 +364,6 @@ def _check_params(self, X: np.ndarray) -> None: if isinstance(self.averaging_method, str) and self.averaging_method != "mean": self._average_params["n_jobs"] = self._n_jobs - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[self._random_state.choice(X.shape[0], self.n_clusters, replace=False)] - - def _first_center_initializer(self, X: np.ndarray) -> np.ndarray: - return X[list(range(self.n_clusters))] - - def _kmeans_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - X, - X[indexes], - method=self.distance, - n_jobs=self._n_jobs, - **self._distance_params, - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - def _handle_empty_cluster( self, X: np.ndarray, diff --git a/aeon/clustering/_k_medoids.py b/aeon/clustering/_k_medoids.py index 1b9d02618c..0825e31cd1 100644 --- a/aeon/clustering/_k_medoids.py +++ b/aeon/clustering/_k_medoids.py @@ -11,6 +11,9 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import ( + resolve_center_initialiser, +) from aeon.clustering.base import BaseClusterer from aeon.distances import get_distance_function, pairwise_distance @@ -235,7 +238,6 @@ def _compute_new_cluster_centers( return np.array(new_center_indexes) def _compute_distance(self, X: np.ndarray, first_index: int, second_index: int): - # Check cache if np.isfinite(self._distance_cache[first_index, second_index]): return self._distance_cache[first_index, second_index] if np.isfinite(self._distance_cache[second_index, first_index]): @@ -243,7 +245,6 @@ def _compute_distance(self, X: np.ndarray, first_index: int, second_index: int): dist = self._distance_callable( X[first_index], X[second_index], **self._distance_params ) - # Update cache self._distance_cache[first_index, second_index] = dist self._distance_cache[second_index, first_index] = dist return dist @@ -271,7 +272,7 @@ def _pam_fit(self, X: np.ndarray): n_cases = X.shape[0] if isinstance(self._init, Callable): - medoids_idxs = 
self._init(X) + medoids_idxs = self._init(X=X) else: medoids_idxs = self._init not_medoid_idxs = np.arange(n_cases, dtype=int) @@ -282,7 +283,6 @@ def _pam_fit(self, X: np.ndarray): not_medoid_idxs = np.delete(np.arange(n_cases, dtype=int), medoids_idxs) for i in range(self.max_iter): - # Initialize best cost change and the associated swap couple. old_medoid_idxs = np.copy(medoids_idxs) best_cost_change = self._compute_optimal_swaps( distance_matrix, @@ -293,7 +293,6 @@ def _pam_fit(self, X: np.ndarray): ) inertia = np.inf - # If one of the swap decrease the objective, return that swap. if best_cost_change is not None and best_cost_change[2] < 0: first, second, _ = best_cost_change medoids_idxs[medoids_idxs == first] = second @@ -393,7 +392,7 @@ def _compute_optimal_swaps( def _alternate_fit(self, X) -> tuple[np.ndarray, np.ndarray, float, int]: cluster_center_indexes = self._init if isinstance(self._init, Callable): - cluster_center_indexes = self._init(X) + cluster_center_indexes = self._init(X=X) old_inertia = np.inf old_indexes = None for i in range(self.max_iter): @@ -431,32 +430,22 @@ def _assign_clusters( def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) - _incorrect_init_str = ( - f"The value provided for init: {self.init} is " - f"invalid. The following are a list of valid init algorithms " - f"strings: random, kmedoids++, first, build. You can also pass a " - f"np.ndarray of size (n_clusters, n_channels, n_timepoints)" - ) - - if isinstance(self.init, str): - if self.init == "random": - self._init = self._random_center_initializer - elif self.init == "kmedoids++": - self._init = self._kmedoids_plus_plus_center_initializer - elif self.init == "first": - self._init = self._first_center_initializer - elif self.init == "build": - self._init = self._pam_build_center_initializer - else: - raise ValueError(_incorrect_init_str) - else: - if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters: - self._init = self.init - else: - raise ValueError(_incorrect_init_str) - if self.distance_params is not None: self._distance_params = self.distance_params + else: + self._distance_params = {} + + self._init = resolve_center_initialiser( + init=self.init, + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + custom_init_handlers={"build": self._pam_build_center_initializer}, + use_indexes=True, + ) if self.n_clusters > X.shape[0]: raise ValueError( @@ -481,28 +470,6 @@ def _check_params(self, X: np.ndarray) -> None: stacklevel=1, ) - def _random_center_initializer(self, X: np.ndarray) -> np.ndarray: - return self._random_state.choice(X.shape[0], self.n_clusters, replace=False) - - def _first_center_initializer(self, _) -> np.ndarray: - return np.array(list(range(self.n_clusters))) - - def _kmedoids_plus_plus_center_initializer(self, X: np.ndarray): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - for _ in range(1, self.n_clusters): - pw_dist = pairwise_distance( - X, X[indexes], method=self.distance, **self._distance_params - ) - min_distances = pw_dist.min(axis=1) - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - centers = X[indexes] - return centers - def _pam_build_center_initializer( self, X: np.ndarray, diff --git a/aeon/clustering/_kasba.py 
b/aeon/clustering/_kasba.py index 63fdd53115..addbef5038 100644 --- a/aeon/clustering/_kasba.py +++ b/aeon/clustering/_kasba.py @@ -9,6 +9,7 @@ from numpy.random import RandomState from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import _kmeans_plus_plus_center_initialiser from aeon.clustering._k_means import EmptyClusterError from aeon.clustering.averaging import kasba_average from aeon.clustering.base import BaseClusterer @@ -142,8 +143,16 @@ def __init__( def _fit(self, X: np.ndarray, y=None): self._check_params(X) - cluster_centers, distances_to_centers, labels = self._elastic_kmeans_plus_plus( - X, + cluster_centers, labels, distances_to_centers = ( + _kmeans_plus_plus_center_initialiser( + X=X, + n_clusters=self.n_clusters, + random_state=self._random_state, + distance=self.distance, + distance_params=self._distance_params, + n_jobs=1, + return_distance_and_labels=True, + ) ) self.labels_, self.cluster_centers_, self.inertia_, self.n_iter_ = self._kasba( X, @@ -321,34 +330,6 @@ def _handle_empty_cluster( return labels, cluster_centers, distances_to_centers - def _elastic_kmeans_plus_plus( - self, - X, - ): - initial_center_idx = self._random_state.randint(X.shape[0]) - indexes = [initial_center_idx] - - min_distances = pairwise_distance( - X, X[initial_center_idx], method=self.distance, **self._distance_params - ).flatten() - labels = np.zeros(X.shape[0], dtype=int) - - for i in range(1, self.n_clusters): - probabilities = min_distances / min_distances.sum() - next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) - indexes.append(next_center_idx) - - new_distances = pairwise_distance( - X, X[next_center_idx], method=self.distance, **self._distance_params - ).flatten() - - closer_points = new_distances < min_distances - min_distances[closer_points] = new_distances[closer_points] - labels[closer_points] = i - - centers = X[indexes] - return centers, min_distances, labels - def _check_params(self, X: np.ndarray) -> None: self._random_state = check_random_state(self.random_state) diff --git a/aeon/clustering/tests/test_clarans.py b/aeon/clustering/tests/test_clarans.py index a1da285cf3..d6968d6ccd 100644 --- a/aeon/clustering/tests/test_clarans.py +++ b/aeon/clustering/tests/test_clarans.py @@ -2,12 +2,9 @@ import numpy as np from sklearn import metrics -from sklearn.utils import check_random_state from aeon.clustering._clarans import TimeSeriesCLARANS -from aeon.clustering.tests.test_k_medoids import check_value_in_every_cluster from aeon.datasets import load_basic_motions, load_gunpoint -from aeon.distances import euclidean_distance def test_clarans_uni(): @@ -95,45 +92,3 @@ def test_clara_multi(): assert isinstance(clarans.cluster_centers_, np.ndarray) for val in proba: assert np.count_nonzero(val == 1.0) == 1 - - -def test_medoids_init(): - """Test init algorithms.""" - X_train, _ = load_gunpoint(split="train") - X_train = X_train[:10] - - num_clusters = 8 - kmedoids = TimeSeriesCLARANS( - random_state=1, - n_init=1, - init="first", - distance="euclidean", - n_clusters=num_clusters, - ) - - kmedoids._random_state = check_random_state(kmedoids.random_state) - kmedoids._distance_cache = np.full((len(X_train), len(X_train)), np.inf) - kmedoids._distance_callable = euclidean_distance - first_medoids_result = kmedoids._first_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, first_medoids_result) - random_medoids_result = kmedoids._random_center_initializer(X_train) - 
check_value_in_every_cluster(num_clusters, random_medoids_result) - kmedoids_plus_plus_medoids_result = kmedoids._kmedoids_plus_plus_center_initializer( - X_train - ) - check_value_in_every_cluster(num_clusters, kmedoids_plus_plus_medoids_result) - kmedoids_build_result = kmedoids._pam_build_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, kmedoids_build_result) - - # Test setting manual init centres - num_clusters = 8 - custom_init_centres = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - kmedoids = TimeSeriesCLARANS( - random_state=1, - n_init=1, - init=custom_init_centres, - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids.fit(X_train) - assert np.array_equal(kmedoids.cluster_centers_, X_train[custom_init_centres]) diff --git a/aeon/clustering/tests/test_cluster_initialisation.py b/aeon/clustering/tests/test_cluster_initialisation.py new file mode 100644 index 0000000000..922e36d33d --- /dev/null +++ b/aeon/clustering/tests/test_cluster_initialisation.py @@ -0,0 +1,111 @@ +"""Tests for cluster initialisation functions.""" + +from collections.abc import Callable + +import numpy as np +import pytest +from numpy.random import RandomState + +from aeon.clustering._cluster_initialisation import ( + _CENTRE_INITIALISER_INDEXES, + _CENTRE_INITIALISERS, +) +from aeon.distances._distance import ELASTIC_DISTANCES, POINTWISE_DISTANCES +from aeon.testing.data_generation import make_example_3d_numpy + +NON_RANDOM_INIT = ["first"] + + +def _run_initialisation_test( + key: str, + init_func: Callable, + init_func_indexes: Callable = None, + init_func_params=None, +): + if init_func_params is None: + init_func_params = {} + + X = make_example_3d_numpy(10, 1, 10, random_state=1, return_y=False) + n_clusters = 3 + init_func_params = { + "X": X, + "n_clusters": n_clusters, + **init_func_params, + } + + values = init_func(**init_func_params, random_state=RandomState(1)) + + assert len(values) == n_clusters + assert values.shape[1:] == X.shape[1:] + + assert np.allclose( + values, init_func(**init_func_params, random_state=RandomState(1)) + ) + + if key not in NON_RANDOM_INIT: + diff_random_state_values = init_func( + **init_func_params, random_state=RandomState(2) + ) + assert not np.allclose(values, diff_random_state_values) + + if init_func_indexes: + indexes = init_func_indexes(**init_func_params, random_state=RandomState(1)) + value_from_indexes = X[indexes] + assert np.allclose(values, value_from_indexes) + + +@pytest.mark.parametrize("init_key", _CENTRE_INITIALISERS.keys()) +def test_center_initialisers(init_key): + """Test all center initialisers.""" + params = {} + if init_key == "kmeans++" or init_key == "kmedoids++": + params["distance"] = "euclidean" + params["distance_params"] = {} + + _run_initialisation_test( + key=init_key, + init_func=_CENTRE_INITIALISERS[init_key], + init_func_indexes=_CENTRE_INITIALISER_INDEXES.get(init_key, None), + init_func_params=params, + ) + + +@pytest.mark.parametrize("init_key", ["kmeans++", "kmedoids++"]) +@pytest.mark.parametrize("dist", POINTWISE_DISTANCES + ELASTIC_DISTANCES) +def test_distance_center_initialisers(init_key, dist): + """Test all center initialisers with distance.""" + params = { + "distance": dist, + "distance_params": {}, + } + _run_initialisation_test( + key=init_key, + init_func=_CENTRE_INITIALISERS[init_key], + init_func_indexes=_CENTRE_INITIALISER_INDEXES.get(init_key, None), + init_func_params=params, + ) + + +@pytest.mark.parametrize("init_key", ["kmeans++", "kmedoids++"]) +def 
test_distance_center_initialisers_params(init_key): + """Test all center initialisers with distance.""" + n_clusters = 3 + X = make_example_3d_numpy(50, 1, 10, random_state=1, return_y=False) + + init_func_no_window = _CENTRE_INITIALISERS[init_key]( + X=X, + n_clusters=n_clusters, + distance_params={}, + distance="soft_dtw", + random_state=RandomState(1), + ) + + init_func_window = _CENTRE_INITIALISERS[init_key]( + X=X, + n_clusters=n_clusters, + distance_params={"gamma": 0.00001}, + distance="soft_dtw", + random_state=RandomState(1), + ) + + assert not np.array_equal(init_func_no_window, init_func_window) diff --git a/aeon/clustering/tests/test_elastic_som.py b/aeon/clustering/tests/test_elastic_som.py index 5d5ef47630..530ba7116c 100644 --- a/aeon/clustering/tests/test_elastic_som.py +++ b/aeon/clustering/tests/test_elastic_som.py @@ -4,6 +4,7 @@ import pytest from aeon.clustering import ElasticSOM +from aeon.clustering._cluster_initialisation import _CENTRE_INITIALISERS from aeon.distances import dtw_distance, msm_alignment_path from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -37,26 +38,29 @@ def test_elastic_som_multivariate(): assert preds.shape == (10,) -def test_elastic_som_init(): +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISERS.keys()) + ["ndarray"]) +def test_elastic_som_init(init): """Test ElasticSOM with a custom initialization.""" X = make_example_3d_numpy( n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 ) - labels = [] - for init in ["random", "kmeans++", "first"]: - clst = ElasticSOM(n_clusters=3, init=init, random_state=1, num_iterations=10) - clst.fit(X) - assert clst.labels_.shape == (10,) - assert clst.cluster_centers_.shape == (3, 5, 20) - labels.append(clst.labels_) + if init == "ndarray": + init = X[:3] - preds = clst.predict(X) - assert preds.shape == (10,) + clst = ElasticSOM(n_clusters=3, init=init, random_state=1, num_iterations=10) + clst.fit(X) + assert clst.labels_.shape == (10,) + assert clst.cluster_centers_.shape == (3, 5, 20) - # Check that the labels are different - assert not np.array_equal(labels[0], labels[1]) - assert not np.array_equal(labels[0], labels[2]) - assert not np.array_equal(labels[1], labels[2]) + preds = clst.predict(X) + assert preds.shape == (10,) + + +def test_elastic_som_init_invalid(): + """Test ElasticSOM with invalid initialization.""" + X = make_example_3d_numpy( + n_cases=10, n_channels=5, n_timepoints=20, return_y=False, random_state=1 + ) # Test invalid init with pytest.raises(ValueError): clst = ElasticSOM( @@ -64,18 +68,6 @@ def test_elastic_som_init(): ) clst.fit(X) - # Test custom ndarray init - clst = ElasticSOM(n_clusters=3, init=X[:3], random_state=1, num_iterations=10) - clst.fit(X) - assert clst.labels_.shape == (10,) - assert clst.cluster_centers_.shape == (3, 5, 20) - - # Last labels is for "first" init - assert np.array_equal(clst.labels_, labels[-1]) - - preds = clst.predict(X) - assert preds.shape == (10,) - # Test more ndarrays than clusters with pytest.raises(ValueError): clst = ElasticSOM(n_clusters=3, init=X[:4], random_state=1, num_iterations=10) diff --git a/aeon/clustering/tests/test_k_means.py b/aeon/clustering/tests/test_k_means.py index ace08a5530..67bafee9f7 100644 --- a/aeon/clustering/tests/test_k_means.py +++ b/aeon/clustering/tests/test_k_means.py @@ -7,6 +7,7 @@ from sklearn import metrics from aeon.clustering import TimeSeriesKMeans +from aeon.clustering._cluster_initialisation import 
_CENTRE_INITIALISERS from aeon.datasets import load_basic_motions from aeon.distances._distance import ELASTIC_DISTANCES from aeon.testing.data_generation import make_example_3d_numpy @@ -169,18 +170,18 @@ def test_k_mean_distances(distance): "random_state": 1, "n_init": 1, "n_clusters": 3, - "init": "kmeans++", + "init": "random", "distance": dist, "distance_params": {key: params[key]}, } # Univariate test with_param_kmeans = _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, n_channels=1, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=1, n_timepoints=10 ) # Multivariate test _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, n_channels=3, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=3, n_timepoints=10 ) if dist in ELASTIC_DISTANCES: @@ -194,13 +195,15 @@ def test_k_mean_distances(distance): continue default_param_kmeans = _run_kmeans_test( - kmeans_params=curr_params, n_cases=40, n_channels=1, n_timepoints=10 + kmeans_params=curr_params, n_cases=80, n_channels=1, n_timepoints=10 ) - # Test parameters passed through kmeans - assert not np.array_equal( - with_param_kmeans.cluster_centers_, default_param_kmeans.cluster_centers_ - ) + if not isinstance(dist, Callable): + # Test parameters passed through kmeans + assert not np.array_equal( + with_param_kmeans.cluster_centers_, + default_param_kmeans.cluster_centers_, + ) @pytest.mark.parametrize("distance", TEST_DISTANCES_WITH_FULL_ALIGNMENT_PATH) @@ -248,15 +251,11 @@ def test_k_mean_ba(distance, averaging_method): @pytest.mark.parametrize("distance", TEST_DISTANCE_WITH_CUSTOM_DISTANCE) -@pytest.mark.parametrize("init", ["random", "kmeans++", "first", "ndarray"]) +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISERS.keys()) + ["ndarray"]) def test_k_mean_init(distance, init): """Test implementation of Kmeans.""" distance, params = distance - # Only kmeans++ needs test with different distances - if init != "kmeans++" and distance != "euclidean": - return - n_cases = 10 n_timepoints = 10 n_clusters = 4 @@ -289,7 +288,7 @@ def test_k_mean_init(distance, init): kmeans._check_params(X_train_uni) if isinstance(kmeans._init, Callable): - uni_init_vals = kmeans._init(X_train_uni) + uni_init_vals = kmeans._init(X=X_train_uni) else: uni_init_vals = kmeans._init @@ -324,7 +323,7 @@ def test_k_mean_init(distance, init): kmeans._check_params(X_train_multi) if isinstance(kmeans._init, Callable): - multi_init_vals = kmeans._init(X_train_multi) + multi_init_vals = kmeans._init(X=X_train_multi) else: multi_init_vals = kmeans._init @@ -408,6 +407,49 @@ def test_empty_cluster(): kmeans.fit(np.array([first, first, first, first, first])) +def test_center_initialisers(): + """Test that CENTER_INITIALISERS work correctly.""" + from numpy.random import RandomState + + X_train = make_example_3d_numpy( + n_cases=20, n_channels=1, n_timepoints=10, random_state=1, return_y=False + ) + n_clusters = 3 + random_state = RandomState(1) + + # Test all available initializers + for init_name, initialiser_func in _CENTRE_INITIALISERS.items(): + if init_name == "kmeans++" or init_name == "kmedoids++": + # kmeans++ and kmedoids++ needs additional parameters + centers = initialiser_func( + X=X_train, + n_clusters=n_clusters, + random_state=random_state, + distance="euclidean", + distance_params={}, + n_jobs=1, + ) + else: + # Other initializers only need basic parameters + centers = initialiser_func( + X=X_train, + n_clusters=n_clusters, + random_state=random_state, + ) + + # Verify output shape + # random_values 
returns (n_clusters, n_channels) - it generates random values + # not based on the input data structure + if init_name == "random_values": + assert centers.shape == (n_clusters, X_train.shape[1], X_train.shape[2]) + else: + assert centers.shape == (n_clusters, X_train.shape[1], X_train.shape[2]) + # Verify no duplicate centers + for i in range(n_clusters): + for j in range(i + 1, n_clusters): + assert not np.array_equal(centers[i], centers[j]) + + def test_invalid_params(): """Test invalid parameters for k-mean.""" uni_data = make_example_3d_numpy( diff --git a/aeon/clustering/tests/test_k_medoids.py b/aeon/clustering/tests/test_k_medoids.py index 0fea3ead19..6864a09465 100644 --- a/aeon/clustering/tests/test_k_medoids.py +++ b/aeon/clustering/tests/test_k_medoids.py @@ -1,12 +1,14 @@ """Tests for time series k-medoids.""" +from collections.abc import Callable + import numpy as np +import pytest from sklearn import metrics -from sklearn.utils import check_random_state +from aeon.clustering._cluster_initialisation import _CENTRE_INITIALISER_INDEXES from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.datasets import load_basic_motions, load_gunpoint -from aeon.distances import euclidean_distance def test_kmedoids_uni(): @@ -159,47 +161,34 @@ def check_value_in_every_cluster(num_clusters, initial_medoids): assert original_length == len(set(initial_medoids)) -def test_medoids_init(): +@pytest.mark.parametrize("init", list(_CENTRE_INITIALISER_INDEXES.keys()) + ["indexes"]) +def test_medoids_init(init): """Test implementation of Kmedoids.""" X_train, _ = load_gunpoint(split="train") X_train = X_train[:10] - num_clusters = 8 - kmedoids = TimeSeriesKMedoids( - random_state=1, - n_init=1, - max_iter=5, - init="first", - distance="euclidean", - n_clusters=num_clusters, - ) - kmedoids._random_state = check_random_state(kmedoids.random_state) - kmedoids._distance_cache = np.full((len(X_train), len(X_train)), np.inf) - kmedoids._distance_callable = euclidean_distance - first_medoids_result = kmedoids._first_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, first_medoids_result) - random_medoids_result = kmedoids._random_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, random_medoids_result) - kmedoids_plus_plus_medoids_result = kmedoids._kmedoids_plus_plus_center_initializer( - X_train - ) - check_value_in_every_cluster(num_clusters, kmedoids_plus_plus_medoids_result) - kmedoids_build_result = kmedoids._pam_build_center_initializer(X_train) - check_value_in_every_cluster(num_clusters, kmedoids_build_result) + num_clusters = 3 - # Test setting manual init centres - num_clusters = 8 - custom_init_centres = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + if init == "indexes": + # Generate random indexes + rng = np.random.RandomState(1) + init = rng.choice(X_train.shape[0], num_clusters, replace=False) + + # Test initializer kmedoids = TimeSeriesKMedoids( random_state=1, n_init=1, max_iter=5, - init=custom_init_centres, + init=init, distance="euclidean", n_clusters=num_clusters, ) - kmedoids.fit(X_train) - assert np.array_equal(kmedoids.cluster_centers_, X_train[custom_init_centres]) + kmedoids._check_params(X_train) + if isinstance(kmedoids._init, Callable): + medoids_result = kmedoids._init(X=X_train) + else: + medoids_result = kmedoids._init + check_value_in_every_cluster(num_clusters, medoids_result) def _get_model_centres(data, distance, method="pam", distance_params=None): @@ -230,3 +219,28 @@ def test_custom_distance_params(): data, distance="msm", 
distance_params={"window": 0.01} ) assert not np.array_equal(default_dist, custom_params_dist) + + +def test_medoids_init_invalid(): + """Test implementation of Kmedoids with invalid init.""" + X_train, _ = load_gunpoint(split="train") + X_train = X_train[:10] + num_clusters = 3 + + # Test float array + with pytest.raises(ValueError, match="Expected an array of integers"): + kmedoids = TimeSeriesKMedoids( + n_clusters=num_clusters, + init=np.array([0.5, 1.5, 2.5]), + random_state=1, + ) + kmedoids.fit(X_train) + + # Test out of bounds + with pytest.raises(ValueError, match="Values must be in the range"): + kmedoids = TimeSeriesKMedoids( + n_clusters=num_clusters, + init=np.array([0, 1, 100]), + random_state=1, + ) + kmedoids.fit(X_train)