
Commit be23e7b

MAINT: Update TSNE for sklearn1.8 (#2793)
* update tsne for sklearn1.8
* fix test
* more corrections
* more fixes for older sklearn
* missing else
* more fixes for older sklearn
* correction
* remove redundant check
* more clear conditions
1 parent b1e9d90 commit be23e7b

4 files changed: +119 -97 lines changed

daal4py/sklearn/manifold/_t_sne.py

Lines changed: 106 additions & 90 deletions
```diff
@@ -122,36 +122,89 @@ def _daal_tsne(self, P, n_samples, X_embedded):
 
         return X_embedded
 
+    # Comment 2025-11-24: This appears to be a copy-paste from an earlier version of the original
+    # scikit-learn with some modifications to make calls to oneDAL under a narrow subset of
+    # allowed input parameters, copy-pastying the rest of the sklearn code when oneDAL is not
+    # called. Note that the conditions checked here are out of synch with the latest sklearn by now.
+    # An early 'is supported' check that offloads to stock sklearn was added later on, which results
+    # in having a lot of dead code paths in this function that can be safely removed.
+    # Note: this method is called from inside 'fit' from the base class in stock scikit-learn.
+    # Hence, the offloading logic is different than in other classes, as falling back to 'fit'
+    # from the base class would lead to a circular loop.
     def _fit(self, X, skip_num_points=0):
         """Private function to fit the model using X as training data."""
-        if isinstance(self.init, str) and self.init == "warn":
-            warnings.warn(
-                "The default initialization in TSNE will change "
-                "from 'random' to 'pca' in 1.2.",
-                FutureWarning,
-            )
-            self._init = "random"
+
+        _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne")
+        _patching_status.and_conditions(
+            [
+                (
+                    self.method == "barnes_hut",
+                    'Used t-SNE method is not "barnes_hut" which is the only supported.',
+                ),
+                (self.n_components == 2, "Number of components != 2."),
+                (self.verbose == 0, "Verbose mode is set."),
+                (
+                    daal_check_version((2021, "P", 600)),
+                    "oneDAL version is lower than 2021.6.",
+                ),
+                (
+                    not (
+                        isinstance(self.init, str) and self.init == "pca" and issparse(X)
+                    ),
+                    "PCA initialization is not supported with sparse input matrices.",
+                ),
+                # Note: these conditions below should result in errors, but stock scikit-learn
+                # does not check for errors at this exact point. Hence, this offloads the erroring
+                # out to the base class, wherever in the process they might be encountered.
+                (
+                    np.isscalar(self.angle) and self.angle > 0.0 and self.angle < 1.0,
+                    "'angle' must be between 0.0 - 1.0",
+                ),
+                (self.early_exaggeration >= 1.0, "early_exaggeration must be at least 1"),
+                (
+                    (
+                        isinstance(self.init, str)
+                        and self.init
+                        in ["random", "pca"]
+                        + (
+                            ["warn"]
+                            if sklearn_check_version("1.0")
+                            and not sklearn_check_version("1.2")
+                            else []
+                        )
+                    )
+                    or isinstance(self.init, np.ndarray),
+                    "'init' must be 'exact', 'pca', or a numpy array.",
+                ),
+            ]
+        )
+        _dal_ready = _patching_status.get_status(logs=True)
+        if not _dal_ready:
+            return super()._fit(X, skip_num_points)
+
+        if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
+            if isinstance(self.init, str) and self.init == "warn":
+                warnings.warn(
+                    "The default initialization in TSNE will change "
+                    "from 'random' to 'pca' in 1.2.",
+                    FutureWarning,
+                )
+                self._init = "random"
+            else:
+                self._init = self.init
         else:
             self._init = self.init
 
-        if isinstance(self._init, str) and self._init == "pca" and issparse(X):
-            raise TypeError(
-                "PCA initialization is currently not supported "
-                "with the sparse input matrix. Use "
-                'init="random" instead.'
-            )
-
-        if self.method not in ["barnes_hut", "exact"]:
-            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
-        if self.angle < 0.0 or self.angle > 1.0:
-            raise ValueError("'angle' must be between 0.0 - 1.0")
-        if self.learning_rate == "warn":
-            warnings.warn(
-                "The default learning rate in TSNE will change "
-                "from 200.0 to 'auto' in 1.2.",
-                FutureWarning,
-            )
-            self._learning_rate = 200.0
+        if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
+            if self.learning_rate == "warn":
+                warnings.warn(
+                    "The default learning rate in TSNE will change "
+                    "from 200.0 to 'auto' in 1.2.",
+                    FutureWarning,
+                )
+                self._learning_rate = 200.0
+            else:
+                self._learning_rate = self.learning_rate
         else:
             self._learning_rate = self.learning_rate
         if self._learning_rate == "auto":
@@ -227,28 +280,15 @@ def _fit(self, X, skip_num_points=0):
                     "or provide the dense distance matrix."
                 )
 
-        if self.method == "barnes_hut" and self.n_components > 3:
-            raise ValueError(
-                "'n_components' should be inferior to 4 for the "
-                "barnes_hut algorithm as it relies on "
-                "quad-tree or oct-tree."
-            )
         random_state = check_random_state(self.random_state)
 
-        if self.early_exaggeration < 1.0:
-            raise ValueError(
-                "early_exaggeration must be at least 1, but is {}".format(
-                    self.early_exaggeration
-                )
-            )
-
         if not sklearn_check_version("1.2"):
             if self.n_iter < 250:
                 raise ValueError("n_iter should be at least 250")
 
         n_samples = X.shape[0]
 
-        neighbors_nn = None
+        # neighbors_nn = None # <- unused variable in stock sklearn, commented out due to coverity
         if self.method == "exact":
            # Retrieve the distance matrix, either using the precomputed one or
            # computing it.
@@ -278,9 +318,8 @@ def _fit(self, X, skip_num_points=0):
                     "All distances should be positive, the " "metric given is not correct"
                 )
 
-            if (
-                self.metric != "euclidean"
-                and getattr(self, "square_distances", True) is True
+            if self.metric != "euclidean" and (
+                sklearn_check_version("1.2") or self.square_distances is True
             ):
                 distances **= 2
 
@@ -339,15 +378,14 @@ def _fit(self, X, skip_num_points=0):
             # Free the memory used by the ball_tree
             del knn
 
-            if (
-                getattr(self, "square_distances", True) is True
-                or self.metric == "euclidean"
+            # knn return the euclidean distance but we need it squared
+            # to be consistent with the 'exact' method. Note that the
+            # the method was derived using the euclidean method as in the
+            # input space. Not sure of the implication of using a different
+            # metric.
+            if sklearn_check_version("1.2") or (
+                self.square_distances is True or self.metric == "euclidean"
             ):
-                # knn return the euclidean distance but we need it squared
-                # to be consistent with the 'exact' method. Note that the
-                # the method was derived using the euclidean method as in the
-                # input space. Not sure of the implication of using a different
-                # metric.
                 distances_nn.data **= 2
 
             # compute the joint probability distribution for the input space
@@ -358,16 +396,23 @@ def _fit(self, X, skip_num_points=0):
         elif self._init == "pca":
             pca = PCA(
                 n_components=self.n_components,
-                svd_solver="randomized",
                 random_state=random_state,
             )
+            if sklearn_check_version("1.2"):
+                # Always output a numpy array, no matter what is configured globally
+                pca.set_output(transform="default")
             X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
-            warnings.warn(
-                "The PCA initialization in TSNE will change to "
-                "have the standard deviation of PC1 equal to 1e-4 "
-                "in 1.2. This will ensure better convergence.",
-                FutureWarning,
-            )
+            if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
+                warnings.warn(
+                    "The PCA initialization in TSNE will change to "
+                    "have the standard deviation of PC1 equal to 1e-4 "
+                    "in 1.2. This will ensure better convergence.",
+                    FutureWarning,
+                )
+            if sklearn_check_version("1.2"):
+                # PCA is rescaled so that PC1 has standard deviation 1e-4 which is
+                # the default value for random initialization. See issue #18018.
+                X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
         elif self._init == "random":
             # The embedding is initialized with iid samples from Gaussians with
             # standard deviation 1e-4.
@@ -377,40 +422,11 @@ def _fit(self, X, skip_num_points=0):
         else:
             raise ValueError("'init' must be 'pca', 'random', or " "a numpy array")
 
-        # Degrees of freedom of the Student's t-distribution. The suggestion
-        # degrees_of_freedom = n_components - 1 comes from
-        # "Learning a Parametric Embedding by Preserving Local Structure"
-        # Laurens van der Maaten, 2009.
-        degrees_of_freedom = max(self.n_components - 1, 1)
-
-        _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne")
-        _patching_status.and_conditions(
-            [
-                (
-                    self.method == "barnes_hut",
-                    'Used t-SNE method is not "barnes_hut" which is the only supported.',
-                ),
-                (self.n_components == 2, "Number of components != 2."),
-                (self.verbose == 0, "Verbose mode is set."),
-                (
-                    daal_check_version((2021, "P", 600)),
-                    "oneDAL version is lower than 2021.6.",
-                ),
-            ]
-        )
-        _dal_ready = _patching_status.get_status(logs=True)
+        # Note: by this point, stock sklearn would calculate degrees of freedom, but oneDAL
+        # doesn't use them.
 
-        if _dal_ready:
-            X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64])
-            return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
-        return self._tsne(
-            P,
-            degrees_of_freedom,
-            n_samples,
-            X_embedded=X_embedded,
-            neighbors=neighbors_nn,
-            skip_num_points=skip_num_points,
-        )
+        X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64])
+        return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
 
     fit.__doc__ = BaseTSNE.fit.__doc__
     fit_transform.__doc__ = BaseTSNE.fit_transform.__doc__
```
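
The main behavioral change in `_fit` above is that the supported-parameter check now runs once at the top of the method, and anything unsupported is offloaded wholesale to stock scikit-learn via `super()._fit` instead of duplicating sklearn's own validation further down. Below is a minimal, self-contained sketch of that gating pattern. The `ConditionChain`, `StockTSNE`, and `PatchedTSNE` classes are simplified stand-ins for illustration only, not the daal4py implementation.

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("patching_demo")


class ConditionChain:
    """Simplified stand-in for daal4py's PatchingConditionsChain (illustration only)."""

    def __init__(self, scope):
        self.scope = scope
        self.ok = True

    def and_conditions(self, conditions):
        # Each entry is (condition_holds, message explaining why acceleration is skipped).
        for holds, message in conditions:
            if not holds:
                self.ok = False
                logger.info("%s: falling back to stock sklearn: %s", self.scope, message)
        return self.ok

    def get_status(self, logs=False):
        return self.ok


class StockTSNE:
    """Stand-in for the stock scikit-learn base class."""

    def _fit(self, X, skip_num_points=0):
        return "stock path"


class PatchedTSNE(StockTSNE):
    def __init__(self, method="barnes_hut", n_components=2, verbose=0):
        self.method, self.n_components, self.verbose = method, n_components, verbose

    def _fit(self, X, skip_num_points=0):
        chain = ConditionChain("sklearn.manifold.TSNE._tsne")
        chain.and_conditions(
            [
                (self.method == "barnes_hut", "only barnes_hut is supported"),
                (self.n_components == 2, "n_components must be 2"),
                (self.verbose == 0, "verbose mode is not supported"),
            ]
        )
        if not chain.get_status(logs=True):
            # Anything unsupported is offloaded to the base-class implementation.
            return super()._fit(X, skip_num_points)
        return "accelerated path"


print(PatchedTSNE()._fit(X=None))                # accelerated path
print(PatchedTSNE(method="exact")._fit(X=None))  # stock path, with the reason logged
```

The single early check is also why the old in-function error raises (sparse input with PCA init, invalid `angle`, `early_exaggeration < 1`) could be dropped: those cases now fail the chain and the base class raises its own, up-to-date errors.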

doc/sources/algorithms.rst

Lines changed: 3 additions & 3 deletions
```diff
@@ -190,11 +190,11 @@ Dimensionality Reduction
 - All parameters are supported except:
 
   - ``metric`` != 'euclidean' or `'minkowski'` with ``p`` != `2`
-
   - ``n_components`` can only be `2`
-
+  - ``method`` != ``"barnes_hut"``
+
   Refer to :ref:`TSNE acceleration details <acceleration_tsne>` to learn more.
-- Sparse data is not supported
+- Sparse data with ``init`` = ``"pca"`` is not supported
 
 Nearest Neighbors
 *****************
```
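
For reference, a small usage sketch of a configuration that stays within the supported subset documented above (euclidean metric, 2 components, `barnes_hut`, non-verbose, dense input). The data and parameter values are purely illustrative.

```python
import numpy as np
from sklearnex.manifold import TSNE  # patched TSNE, as imported in the tests below

# Illustrative data only: 300 dense float32 samples.
X = np.random.default_rng(0).standard_normal((300, 20)).astype(np.float32)

# All of these settings fall inside the documented accelerated subset.
embedding = TSNE(
    n_components=2,
    method="barnes_hut",
    init="random",
    random_state=0,
).fit_transform(X)

print(embedding.shape)  # (300, 2)
```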

doc/sources/guide/acceleration.rst

Lines changed: 3 additions & 3 deletions
```diff
@@ -34,9 +34,9 @@ The overall acceleration of TSNE depends on the acceleration of each of these al
 - ``metric`` != `'euclidean'` or `'minkowski'` with ``p`` != `2`
 - The Gradient Descent part of the algorithm supports all parameters except:
 
-  - ``n_components`` = `3`
-  - ``method`` = `'exact'`
-  - ``verbose`` != `0`
+  - ``n_components`` > ``2``
+  - ``method`` = ``'exact'``
+  - ``verbose`` != ``0``
 
 To get better performance, use parameters supported by both components.
```
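
To see which path a given call actually takes, the patching decision made in `_fit` can be observed through sklearnex's verbose logging. The sketch below assumes the logging setup described in the sklearnex documentation (a logger named `sklearnex` at `INFO` level); the exact message text varies between versions.

```python
import logging

import numpy as np
from sklearnex.manifold import TSNE

# Enable sklearnex verbose mode so each patching decision is written to the log.
logging.basicConfig()
logging.getLogger("sklearnex").setLevel(logging.INFO)

X = np.random.default_rng(0).standard_normal((300, 20)).astype(np.float32)

# Expected to take the accelerated path: all documented conditions hold.
TSNE(n_components=2, method="barnes_hut", random_state=0).fit_transform(X)

# method="exact" is outside the supported subset, so a fallback to stock
# scikit-learn is expected to be reported for the gradient-descent part.
TSNE(n_components=2, method="exact", random_state=0).fit_transform(X)
```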

sklearnex/manifold/tests/test_tsne.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -19,6 +19,8 @@
 from numpy.testing import assert_allclose
 from sklearn.metrics.pairwise import pairwise_distances
 
+from daal4py.sklearn._utils import sklearn_check_version
+
 # Note: n_components must be 2 for now
 from onedal.tests.utils._dataframes_support import (
     _as_numpy,
@@ -161,8 +163,12 @@ def test_tsne_functionality_and_edge_cases(
     assert np.any(embedding != 0)
 
 
+# Note: since sklearn1.2, the PCA initialization divides by standard deviations of components.
+# Since those will be zeros for constant data, it will end up producing NaNs, hence it's not tested.
 @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
-@pytest.mark.parametrize("init", ["pca", "random"])
+@pytest.mark.parametrize(
+    "init", ["random"] + (["pca"] if not sklearn_check_version("1.2") else [])
+)
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_tsne_constant_data(init, dataframe, queue, dtype):
     from sklearnex.manifold import TSNE
```
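
The comment added in the test diff explains why `init="pca"` is no longer exercised with constant data on sklearn >= 1.2: the rescaling divides by the standard deviation of the first principal component, which is zero for constant input. A short standalone illustration of that arithmetic (not the test itself, just the failure mode):

```python
import numpy as np
from sklearn.decomposition import PCA

# Constant data: every row is identical.
X = np.full((20, 5), 3.0, dtype=np.float32)

# PCA of constant data produces an all-zero embedding, so PC1 has zero std.
X_embedded = PCA(n_components=2).fit_transform(X).astype(np.float32)
print(np.std(X_embedded[:, 0]))  # 0.0

# The sklearn >= 1.2 style rescaling (see the _t_sne.py diff above) divides by
# that standard deviation, turning the PCA-initialized embedding into NaNs.
with np.errstate(invalid="ignore", divide="ignore"):
    X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
print(np.isnan(X_embedded).all())  # True
```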
