From e454a44b0f8ebe629650b2858f1c7323a1f4f802 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Petitpierre?=
<38254995+RPetitpierre@users.noreply.github.com>
Date: Sat, 8 Feb 2025 19:25:05 +0100
Subject: [PATCH 1/2] Added fork note (quick fix for VonMisesFisherMixture)
---
README.md | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/README.md b/README.md
index 13041e4..a5eb9b3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+> **_IMPORTANT NOTE:_** This fork applies a quick fix so that `VonMisesFisherMixture` can still be used.
+> It works with scikit-learn 1.6.1.
+> `SphericalKMeans` is not supported in this fork.
+>
# Clustering on the unit hypersphere in scikit-learn
From 4c9fb3b569162121210b792dde4476d39bec3014 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Petitpierre?=
<38254995+RPetitpierre@users.noreply.github.com>
Date: Sat, 8 Feb 2025 19:29:57 +0100
Subject: [PATCH 2/2] Quick fix of von_mises_fisher_mixture
Obtained by replicating locally the deprecated dependencies from sklearn=0.22
---
spherecluster/von_mises_fisher_mixture.py | 212 +++++++++++++++++++++-
1 file changed, 205 insertions(+), 7 deletions(-)
diff --git a/spherecluster/von_mises_fisher_mixture.py b/spherecluster/von_mises_fisher_mixture.py
index 53c4c67..1727350 100644
--- a/spherecluster/von_mises_fisher_mixture.py
+++ b/spherecluster/von_mises_fisher_mixture.py
@@ -8,7 +8,9 @@
from scipy.special import logsumexp
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
-from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape
+
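+# deprecated k_means_ dependencies, replicated locally below (NOTE(review): the copies call euclidean_distances, stable_cumsum and row_norms -- confirm these are imported)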
+#from sklearn.cluster.k_means_ import _init_centroids, _tolerance, _validate_center_shape
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import normalize
from sklearn.utils import check_array, check_random_state, as_float_array
@@ -16,11 +18,207 @@
from sklearn.utils.validation import FLOAT_DTYPES
from sklearn.utils.validation import check_is_fitted
-from . import spherical_kmeans
+from sklearn.utils.sparsefuncs import mean_variance_axis
MAX_CONTENTRATION = 1e10
+def _tolerance(X, tol):
+ """Return tol scaled by the mean per-feature variance of X, making it independent of the dataset scale"""
+ if sp.issparse(X):
+ variances = mean_variance_axis(X, axis=0)[1]
+ else:
+ variances = np.var(X, axis=0)
+ return np.mean(variances) * tol
+
+
+def _validate_center_shape(X, n_centers, centers):
+ """Raise ValueError unless centers has n_centers rows and as many columns as X"""
+ if len(centers) != n_centers:
+ raise ValueError('The shape of the initial centers (%s) '
+ 'does not match the number of clusters %i'
+ % (centers.shape, n_centers))
+ if centers.shape[1] != X.shape[1]:
+ raise ValueError(
+ "The number of features of the initial centers %s "
+ "does not match the number of features of the data %s."
+ % (centers.shape[1], X.shape[1]))
+
+
+def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
+ """Init n_clusters seeds according to k-means++; returns an (n_clusters, n_features) array
+
+ Parameters
+ ----------
+ X : array or sparse matrix, shape (n_samples, n_features)
+ The data to pick seeds for. To avoid memory copy, the input data
+ should be double precision (dtype=np.float64).
+
+ n_clusters : integer
+ The number of seeds to choose
+
+ x_squared_norms : array, shape (n_samples,)
+ Squared Euclidean norm of each data point.
+
+ random_state : RandomState instance
+ The generator used to initialize the centers. It is used directly
+ (.randint and .random_sample are called on it), so it must not be an int.
+ See :term:`Glossary <random_state>`.
+
+ n_local_trials : integer, optional
+ The number of seeding trials for each center (except the first),
+ of which the one reducing inertia the most is greedily chosen.
+ Set to None to make the number of trials depend logarithmically
+ on the number of seeds (2+log(k)); this is the default.
+
+ Notes
+ -----
+ Selects initial cluster centers for k-means clustering in a smart way
+ to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
+ "k-means++: the advantages of careful seeding". ACM-SIAM symposium
+ on Discrete algorithms. 2007
+
+ Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
+ which is the implementation used in the aforementioned paper.
+ """
+ n_samples, n_features = X.shape
+ centers = np.empty((n_clusters, n_features), dtype=X.dtype)
+
+ assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
+
+ # Set the number of local seeding trials if none is given
+ if n_local_trials is None:
+ # This is what Arthur/Vassilvitskii tried, but did not report
+ # specific results for other than mentioning in the conclusion
+ # that it helped.
+ n_local_trials = 2 + int(np.log(n_clusters))
+
+ # Pick first center randomly
+ center_id = random_state.randint(n_samples)
+ if sp.issparse(X):
+ centers[0] = X[center_id].toarray()
+ else:
+ centers[0] = X[center_id]
+
+ # Initialize list of closest distances and calculate current potential
+ closest_dist_sq = euclidean_distances(
+ centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
+ squared=True)
+ current_pot = closest_dist_sq.sum()
+
+ # Pick the remaining n_clusters-1 points
+ for c in range(1, n_clusters):
+ # Choose center candidates by sampling with probability proportional
+ # to the squared distance to the closest existing center
+ rand_vals = random_state.random_sample(n_local_trials) * current_pot
+ candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
+ rand_vals)
+ # XXX: numerical imprecision can result in a candidate_id out of range, so clip to the last index
+ np.clip(candidate_ids, None, closest_dist_sq.size - 1,
+ out=candidate_ids)
+
+ # Compute distances to center candidates
+ distance_to_candidates = euclidean_distances(
+ X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
+
+ # Update closest distances squared and potential for each candidate
+ np.minimum(closest_dist_sq, distance_to_candidates,
+ out=distance_to_candidates)
+ candidates_pot = distance_to_candidates.sum(axis=1)
+
+ # Decide which candidate is the best
+ best_candidate = np.argmin(candidates_pot)
+ current_pot = candidates_pot[best_candidate]
+ closest_dist_sq = distance_to_candidates[best_candidate]
+ best_candidate = candidate_ids[best_candidate]
+
+ # Permanently add best center candidate found in local tries
+ if sp.issparse(X):
+ centers[c] = X[best_candidate].toarray()
+ else:
+ centers[c] = X[best_candidate]
+
+ return centers
+
+def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
+ init_size=None):
+ """Compute the k initial centroids for X using the strategy given by init
+
+ Parameters
+ ----------
+
+ X : array, shape (n_samples, n_features)
+
+ k : int
+ number of centroids
+
+ init : {'k-means++', 'random'}, ndarray or callable
+ Method for initialization
+
+ random_state : int, RandomState instance or None (default)
+ Determines random number generation for centroid initialization. Use
+ an int to make the randomness deterministic.
+ See :term:`Glossary <random_state>`.
+
+ x_squared_norms : array, shape (n_samples,), optional
+ Squared euclidean norm of each data point. Pass it if you have it at
+ hand already to avoid it being recomputed here. Default: None
+
+ init_size : int, optional
+ Number of samples to randomly sample for speeding up the
+ initialization (sometimes at the expense of accuracy): when it is
+ smaller than n_samples, the centroids are chosen from a random
+ subset of init_size rows of X. This needs to be larger than k.
+
+ Returns
+ -------
+ centers : array, shape (k, n_features)
+ """
+ random_state = check_random_state(random_state)
+ n_samples = X.shape[0]
+
+ if x_squared_norms is None:
+ x_squared_norms = row_norms(X, squared=True)
+
+ if init_size is not None and init_size < n_samples:
+ if init_size < k:
+ warnings.warn(
+ "init_size=%d should be larger than k=%d. "
+ "Setting it to 3*k" % (init_size, k),
+ RuntimeWarning, stacklevel=2)
+ init_size = 3 * k
+ init_indices = random_state.randint(0, n_samples, init_size)
+ X = X[init_indices]
+ x_squared_norms = x_squared_norms[init_indices]
+ n_samples = X.shape[0]
+ elif n_samples < k:
+ raise ValueError(
+ "n_samples=%d should be larger than k=%d" % (n_samples, k))
+
+ if isinstance(init, str) and init == 'k-means++':
+ centers = _k_init(X, k, random_state=random_state,
+ x_squared_norms=x_squared_norms)
+ elif isinstance(init, str) and init == 'random':
+ seeds = random_state.permutation(n_samples)[:k]
+ centers = X[seeds]
+ elif hasattr(init, '__array__'):
+ # ensure that the centers have the same dtype as X
+ # this is a requirement of fused types of cython
+ centers = np.array(init, dtype=X.dtype)
+ elif callable(init):
+ centers = init(X, k, random_state=random_state)
+ centers = np.asarray(centers, dtype=X.dtype)
+ else:
+ raise ValueError("the init parameter for the k-means should "
+ "be 'k-means++' or 'random' or an ndarray, "
+ "'%s' (type '%s') was passed." % (init, type(init)))
+
+ if sp.issparse(centers):
+ centers = centers.toarray()
+
+ _validate_center_shape(X, k, centers)
+ return centers
+
def _inertia_from_labels(X, centers, labels):
"""Compute inertia with cosine distance using known labels.
"""
@@ -205,11 +403,11 @@ def _init_unit_centers(X, n_clusters, random_state, init):
return centers
elif init == "spherical-k-means":
- labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd(
- X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++"
- )
-
- return centers
+ raise NotImplementedError("This option from the original spherecluster implementation is deprecated")
+ #labels, inertia, centers, iters = spherical_kmeans._spherical_kmeans_single_lloyd(
+ # X, n_clusters, x_squared_norms=np.ones((n_examples,)), init="k-means++"
+ #)
+ #return centers
elif init == "random":
centers = np.random.randn(n_clusters, n_features)